ToFeatures方法依据商业活动的数量,生成所需的特征数组。
private static float[] ToFeatures(ILookup<string, int> lookup, int count) { var result = new float[count]; foreach (var item in lookup) { var key = Convert.ToInt32(item.Key) - 1; result[key] = item.Sum(); } return result; }
数据视图
取得用于生成视图的数组后,这里使用CreateStreamingDataView方法构建数据视图。又因为Features属性是一个数组,所以必须声明其大小。
var mlContext = new MLContext(); var schemaDef = SchemaDefinition.Create(typeof(PivotData)); schemaDef["Features"].ColumnType = new VectorType(NumberType.R4, count); var pivotDataView = mlContext.CreateStreamingDataView(pivotDataArray, schemaDef);
PCA
PCA(Principal Component Analysis),即主成分分析,是为了将过多的维度值减少至一个合适的范围以便于分析,这里是降到二维空间。
new PrincipalComponentAnalysisEstimator(mlContext, "Features", "PCAFeatures", rank: 2)
OneHotEncoding
One Hot Encoding在此处的作用是将LastName从字符串转换为数字矩阵。
new OneHotEncodingEstimator(mlContext, new[] { new OneHotEncodingEstimator.ColumnInfo("LastName", "LastNameKey", OneHotEncodingTransformer.OutputKind.Ind) })
训练器
K-Means是常用的应对聚类问题的训练器,这里假设要分为三类。
mlContext.Clustering.Trainers.KMeans("Features", clustersCount: 3)
训练模型
trainingPipeline.Fit(pivotDataView);
评估模型
var predictions = trainedModel.Transform(pivotDataView); var metrics = mlContext.Clustering.Evaluate(predictions, score: "Score", features: "Features"); Console.WriteLine($"*************************************************"); Console.WriteLine($"* Metrics for {trainer} clustering model "); Console.WriteLine($"*------------------------------------------------"); Console.WriteLine($"* AvgMinScore: {metrics.AvgMinScore}"); Console.WriteLine($"* DBI is: {metrics.Dbi}"); Console.WriteLine($"*************************************************");
可得到如下的评估结果。
*************************************************
* Metrics for Microsoft.ML.Trainers.KMeans.KMeansPlusPlusTrainer clustering model
*------------------------------------------------
* AvgMinScore: 2.3154067927599
* DBI is: 2.69100740819456
*************************************************
使用模型
var clusteringPredictions = predictions .AsEnumerable<ClusteringPrediction>(mlContext, false) .ToArray();
Plot
为了更直观地观察,可以用OxyPlot类库生成结果图片。
添加类库:
dotnet add package OxyPlot.Core
Plot生成处理:
var plot = new PlotModel { Title = "Customer Segmentation", IsLegendVisible = true }; var clusters = clusteringPredictions.Select(p => p.SelectedClusterId).Distinct().OrderBy(x => x); foreach (var cluster in clusters) { var scatter = new ScatterSeries { MarkerType = MarkerType.Circle, MarkerStrokeThickness = 2, Title = $"Cluster: {cluster}", RenderInLegend = true }; var series = clusteringPredictions .Where(p => p.SelectedClusterId == cluster) .Select(p => new ScatterPoint(p.Location[0], p.Location[1])).ToArray(); scatter.Points.AddRange(series); plot.Series.Add(scatter); } plot.DefaultColors = OxyPalettes.HueDistinct(plot.Series.Count).Colors; var exporter = new SvgExporter { Width = 600, Height = 400 }; using (var fs = new System.IO.FileStream(_plotSvg, System.IO.FileMode.Create)) { exporter.Export(plot, fs); }最后的图片如下所示:
Program类:
using CustomerSegmentation.DataStructures;
using Microsoft.ML;
using System;
using System.IO;
using System.Linq;
using Microsoft.ML.Runtime.Api;
using Microsoft.ML.Transforms.Projections;
using Microsoft.ML.Transforms.Categorical;
using Microsoft.ML.Runtime.Data;
using OxyPlot;
using OxyPlot.Series;
using Microsoft.ML.Core.Data;

namespace CustomerSegmentation
{
    /// <summary>
    /// Console sample that clusters customers with K-Means based on which offers
    /// they responded to, prints clustering metrics and writes a scatter plot as SVG.
    /// </summary>
    class Program
    {
        static readonly string _offersCsv = Path.Combine(Environment.CurrentDirectory, "assets", "offers.csv");
        static readonly string _transactionsCsv = Path.Combine(Environment.CurrentDirectory, "assets", "transactions.csv");
        static readonly string _plotSvg = Path.Combine(Environment.CurrentDirectory, "assets", "customerSegmentation.svg");

        /// <summary>
        /// Builds a dense feature vector with one slot per offer.
        /// </summary>
        /// <param name="lookup">Groups keyed by a 1-based offer id (as text); the values of each group are summed.</param>
        /// <param name="count">Length of the resulting vector.</param>
        /// <returns>An array of length <paramref name="count"/> where slot <c>id - 1</c> holds the sum for offer <c>id</c>; other slots stay 0.</returns>
        /// <exception cref="ArgumentOutOfRangeException">An offer id falls outside <c>1..count</c>.</exception>
        private static float[] ToFeatures(ILookup<string, int> lookup, int count)
        {
            var result = new float[count];

            foreach (var item in lookup)
            {
                // Ids are machine-readable data, so parse them culture-independently.
                var index = Convert.ToInt32(item.Key, System.Globalization.CultureInfo.InvariantCulture) - 1;

                // Fail with the offending id instead of a bare IndexOutOfRangeException.
                if (index < 0 || index >= result.Length)
                {
                    throw new ArgumentOutOfRangeException(
                        nameof(lookup),
                        item.Key,
                        $"Offer id must be between 1 and {count}.");
                }

                result[index] = item.Sum();
            }

            return result;
        }

        /// <summary>
        /// Writes the clustering metrics block to the console.
        /// </summary>
        /// <param name="trainerName">Text shown as the trainer name.</param>
        /// <param name="avgMinScore">Value printed on the "AvgMinScore" line.</param>
        /// <param name="dbi">Value printed on the "DBI" line.</param>
        private static void PrintMetrics(string trainerName, double avgMinScore, double dbi)
        {
            Console.WriteLine("*************************************************");
            Console.WriteLine($"* Metrics for {trainerName} clustering model ");
            Console.WriteLine("*------------------------------------------------");
            Console.WriteLine($"* AvgMinScore: {avgMinScore}");
            Console.WriteLine($"* DBI is: {dbi}");
            Console.WriteLine("*************************************************");
        }

        /// <summary>
        /// Renders one scatter series per predicted cluster and exports the plot as SVG.
        /// </summary>
        /// <param name="predictions">Predictions to plot; the first two entries of <c>Location</c> are used as X and Y.</param>
        /// <param name="path">Destination file; created or overwritten.</param>
        private static void SavePlot(ClusteringPrediction[] predictions, string path)
        {
            var plot = new PlotModel { Title = "Customer Segmentation", IsLegendVisible = true };

            // A single grouping pass replaces re-filtering the whole array once per cluster.
            var byCluster = predictions
                .GroupBy(p => p.SelectedClusterId)
                .OrderBy(g => g.Key);

            foreach (var cluster in byCluster)
            {
                var scatter = new ScatterSeries
                {
                    MarkerType = MarkerType.Circle,
                    MarkerStrokeThickness = 2,
                    Title = $"Cluster: {cluster.Key}",
                    RenderInLegend = true
                };

                // NOTE(review): Location is presumably bound to the 2-D "PCAFeatures" column —
                // confirm in ClusteringPrediction.
                scatter.Points.AddRange(cluster.Select(p => new ScatterPoint(p.Location[0], p.Location[1])));
                plot.Series.Add(scatter);
            }

            plot.DefaultColors = OxyPalettes.HueDistinct(plot.Series.Count).Colors;

            var exporter = new SvgExporter { Width = 600, Height = 400 };
            using (var fs = new FileStream(path, FileMode.Create))
            {
                exporter.Export(plot, fs);
            }
        }

        /// <summary>
        /// Loads offers and transactions, pivots them into one feature vector per customer,
        /// trains and evaluates a 3-cluster K-Means model, then plots the result.
        /// </summary>
        static void Main(string[] args)
        {
            var offers = Offer.ReadFromCsv(_offersCsv);
            var transactions = Transaction.ReadFromCsv(_transactionsCsv);

            // One row per (offer, transaction) match; Count = 1 so sums give purchase counts.
            var clusterData = (from of in offers
                               join tr in transactions on of.OfferId equals tr.OfferId
                               select new
                               {
                                   of.OfferId,
                                   of.Campaign,
                                   of.Discount,
                                   tr.LastName,
                                   of.LastPeak,
                                   of.Minimum,
                                   of.Origin,
                                   of.Varietal,
                                   Count = 1,
                               }).ToArray();

            var count = offers.Count();

            // Pivot: one PivotData per customer last name, one feature slot per offer.
            var pivotDataArray = (from c in clusterData
                                  group c by c.LastName into gcs
                                  let lookup = gcs.ToLookup(y => y.OfferId, y => y.Count)
                                  select new PivotData()
                                  {
                                      LastName = gcs.Key,
                                      Features = ToFeatures(lookup, count)
                                  }).ToArray();

            var mlContext = new MLContext();

            // Features is an array, so the schema is given an explicit vector size.
            var schemaDef = SchemaDefinition.Create(typeof(PivotData));
            schemaDef["Features"].ColumnType = new VectorType(NumberType.R4, count);
            var pivotDataView = mlContext.CreateStreamingDataView(pivotDataArray, schemaDef);

            // PCA projects Features to rank 2 ("PCAFeatures"); LastName is one-hot encoded.
            var dataProcessPipeline = new PrincipalComponentAnalysisEstimator(mlContext, "Features", "PCAFeatures", rank: 2)
                .Append(new OneHotEncodingEstimator(mlContext, new[]
                {
                    new OneHotEncodingEstimator.ColumnInfo("LastName", "LastNameKey", OneHotEncodingTransformer.OutputKind.Ind)
                }));

            var trainer = mlContext.Clustering.Trainers.KMeans("Features", clustersCount: 3);
            var trainingPipeline = dataProcessPipeline.Append(trainer);

            ITransformer trainedModel = trainingPipeline.Fit(pivotDataView);

            // Evaluation runs on the same data the model was trained on.
            var predictions = trainedModel.Transform(pivotDataView);
            var metrics = mlContext.Clustering.Evaluate(predictions, score: "Score", features: "Features");
            PrintMetrics(trainer.ToString(), metrics.AvgMinScore, metrics.Dbi);

            var clusteringPredictions = predictions
                .AsEnumerable<ClusteringPrediction>(mlContext, false)
                .ToArray();

            SavePlot(clusteringPredictions, _plotSvg);

            // Keep the console window open until a key is pressed.
            Console.Read();
        }
    }
}