
I have a model with 35 features. After transforming the string values into vectors, I end up with transformed data containing 54 features.

After training I want to evaluate the feature weights, but I end up with far more features (104, roughly twice as many) than are available. The result is an "index out of bounds" when looking up the corresponding feature column.

PFI

var featureImportanceMetrics =
                permutationFeatureImportance
                    .Select((metric, index) => new { index, metric.RSquared })
                    .OrderByDescending(myFeatures => Math.Abs(myFeatures.RSquared.Mean));

Can anyone tell me what I am doing wrong, or did I just hit a defect in the ML.NET library?

(Note: I have a few algorithms commented out, because I wanted to play with BinaryClassification but couldn't get it to run without errors.)

The code

 public class Employee
        {
            [LoadColumn(0)]
            public float Age { get; set; }
            [LoadColumn(1)]
            [ColumnName("Label")]
            public float Attrition { get; set; }
            [LoadColumn(2)]
            public string BusinessTravel { get; set; }
            [LoadColumn(3)]
            public float DailyRate { get; set; }
            [LoadColumn(4)]
            public string Department { get; set; }
            [LoadColumn(5)]
            public float DistanceFromHome { get; set; }
            [LoadColumn(6)]
            public float Education { get; set; }
            [LoadColumn(7)]
            public string EducationField { get; set; }
            [LoadColumn(8)]
            public float EmployeeCount { get; set; }
            [LoadColumn(9)]
            public float EmployeeNumber { get; set; }
            [LoadColumn(10)]
            public float EnvironmentSatisfaction { get; set; }
            [LoadColumn(11)]
            public string Gender { get; set; }
            [LoadColumn(12)]
            public float HourlyRate { get; set; }
            [LoadColumn(13)]
            public float JobInvolvement { get; set; }
            [LoadColumn(14)]
            public float JobLevel { get; set; }
            [LoadColumn(15)]
            public string JobRole { get; set; }
            [LoadColumn(16)]
            public float JobSatisfaction { get; set; }
            [LoadColumn(17)]
            public string MaritalStatus { get; set; }
            [LoadColumn(18)]
            public float MonthlyIncome { get; set; }
            [LoadColumn(19)]
            public float MonthlyRate { get; set; }
            [LoadColumn(20)]
            public float NumCompaniesWorked { get; set; }
            [LoadColumn(21)]
            public string Over18 { get; set; }
            [LoadColumn(22)]
            public string OverTime { get; set; }
            [LoadColumn(23)]
            public float PercentSalaryHike { get; set; }
            [LoadColumn(24)]
            public float PerformanceRating { get; set; }
            [LoadColumn(25)]
            public float RelationshipSatisfaction { get; set; }
            [LoadColumn(26)]
            public float StandardHours { get; set; }
            [LoadColumn(27)]
            public float StockOptionLevel { get; set; }
            [LoadColumn(28)]
            public float TotalWorkingYears { get; set; }
            [LoadColumn(29)]
            public float TrainingTimesLastYear { get; set; }
            [LoadColumn(30)]
            public float WorkLifeBalance { get; set; }
            [LoadColumn(31)]
            public float YearsAtCompany { get; set; }
            [LoadColumn(32)]
            public float YearsInCurrentRole { get; set; }
            [LoadColumn(33)]
            public float YearsSinceLastPromotion { get; set; }
            [LoadColumn(34)]
            public float YearsWithCurrManager { get; set; }
        }
        public class EmployeeTransformed
        {
            public float Age { get; set; }
            [ColumnName("Label")]
            public float Attrition { get; set; }
            public float[] BusinessTravel { get; set; }
            public float DailyRate { get; set; }
            public float[] Department { get; set; }
            public float DistanceFromHome { get; set; }
            public float Education { get; set; }
            public float[] EducationField { get; set; }
            public float EmployeeCount { get; set; }
            public float EmployeeNumber { get; set; }
            public float EnvironmentSatisfaction { get; set; }
            public float[] Gender { get; set; }
            public float HourlyRate { get; set; }
            public float JobInvolvement { get; set; }
            public float JobLevel { get; set; }
            public float[] JobRole { get; set; }
            public float JobSatisfaction { get; set; }
            public float[] MaritalStatus { get; set; }
            public float MonthlyIncome { get; set; }
            public float MonthlyRate { get; set; }
            public float NumCompaniesWorked { get; set; }
            public float[] Over18 { get; set; }
            public float[] OverTime { get; set; }
            public float PercentSalaryHike { get; set; }
            public float PerformanceRating { get; set; }
            public float RelationshipSatisfaction { get; set; }
            public float StandardHours { get; set; }
            public float StockOptionLevel { get; set; }
            public float TotalWorkingYears { get; set; }
            public float TrainingTimesLastYear { get; set; }
            public float WorkLifeBalance { get; set; }
            public float YearsAtCompany { get; set; }
            public float YearsInCurrentRole { get; set; }
            public float YearsSinceLastPromotion { get; set; }
            public float YearsWithCurrManager { get; set; }

        }

        public ActionResult Turnover()
        {
            MLContext mlContext = new MLContext();

            var _appPath = AppDomain.CurrentDomain.BaseDirectory;
            //var _dataPath = Path.Combine(_appPath, "Datasets", "WA_Fn-UseC_-HR-Employee-Attrition.csv");
            var _dataPath = Path.Combine(_appPath, "Datasets", "attrition_small_dataset.csv");

            // Load data from file
            IDataView dataView = mlContext.Data.LoadFromTextFile<Employee>(_dataPath, separatorChar: ',', hasHeader: true);
            var a = mlContext.Data.CreateEnumerable<Employee>(dataView, true).ToList();

            // Define categorical transform estimator
            var categoricalEstimator = mlContext.Transforms.Categorical.OneHotEncoding("BusinessTravel")
            .Append(mlContext.Transforms.Categorical.OneHotEncoding("Department"))
            .Append(mlContext.Transforms.Categorical.OneHotEncoding("EducationField"))
            .Append(mlContext.Transforms.Categorical.OneHotEncoding("Gender"))
            .Append(mlContext.Transforms.Categorical.OneHotEncoding("JobRole"))
            .Append(mlContext.Transforms.Categorical.OneHotEncoding("MaritalStatus"))
            .Append(mlContext.Transforms.Categorical.OneHotEncoding("Over18"))
            .Append(mlContext.Transforms.Categorical.OneHotEncoding("OverTime"));
            IDataView transformedData = categoricalEstimator.Fit(dataView).Transform(dataView);

            // Split into train and test dataset
            DataOperationsCatalog.TrainTestData dataSplit = mlContext.Data.TrainTestSplit(transformedData, testFraction: 0.2);
            IDataView trainData = dataSplit.TrainSet;
            IDataView testData = dataSplit.TestSet;

            // Get the column names of input features.
            string[] featureColumnNames =
                trainData.Schema
                    .Select(column => column.Name)
                    .Where(columnName => columnName != "Label").ToArray();

            // Define estimator with data pre-processing steps
            IEstimator<ITransformer> dataPrepEstimator =
                mlContext.Transforms.Concatenate("Features", featureColumnNames)
                    .Append(mlContext.Transforms.NormalizeMinMax("Features"));

            IDataView preprocessedTrainData = dataPrepEstimator.Fit(trainData).Transform(trainData);
            var e = mlContext.Data.CreateEnumerable<EmployeeTransformed>(preprocessedTrainData, true).ToList();

            /*
            //  Define Stochastic Dual Coordinate Ascent machine learning estimator
            //var sdcaEstimator = mlContext.Regression.Trainers.Sdca(labelColumnName: "Age", featureColumnName: "Features");
            //var sdcaEstimator = mlContext.Regression.Trainers.Sdca(labelColumnName: "Attrition", maximumNumberOfIterations: 100);
            //var sdcaEstimator = mlContext.BinaryClassification.Trainers.FastTree(labelColumnName : "Attrition", featureColumnName : "Features", numberOfLeaves: 50, numberOfTrees: 50, minimumExampleCountPerLeaf: 20);
            */
            var sdcaEstimator = mlContext.Regression.Trainers.Sdca();

            // Train machine learning model
            var sdcaModel = sdcaEstimator.Fit(preprocessedTrainData);

            // Explain the model with Permutation Feature Importance (PFI)
            ImmutableArray<RegressionMetricsStatistics> permutationFeatureImportance =
            mlContext
                .Regression
                .PermutationFeatureImportance(sdcaModel, preprocessedTrainData, permutationCount: 3);

            // Order features by importance
            var featureImportanceMetrics =
                permutationFeatureImportance
                    .Select((metric, index) => new { index, metric.RSquared })
                    .OrderByDescending(myFeatures => Math.Abs(myFeatures.RSquared.Mean));

            var line = "Feature\tPFI <br>";

            var z = featureColumnNames;
            foreach (var feature in featureImportanceMetrics)
            {
                line += $"{featureColumnNames[feature.index],-20}|\t{feature.RSquared.Mean:F6} <br>";
            }

            return Content(line);
        }

1 Answer

I solved my issue; hopefully this helps others as well. The ML.NET documentation mostly focuses on the happy flow, and the various changes to the framework have left a number of code samples out of date.

After transforming the data I now use the transformed schema instead of the original schema, and I filter out the Label and Features columns. For every string property, ML.NET produces both an Int32 column and a Vector (float[]) column, with the extra columns flagged as IsHidden, so you can filter those out as well; that leaves you with the properties you need.
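
As a minimal sketch of that filtering (variable names are mine), assuming dataView has already been run through the one-hot encoder as in the code below, this keeps only the visible, non-label columns and produces the featureColumns array that the Concatenate call expects:

// Sketch: keep only visible, non-label columns from the transformed schema.
// The hidden duplicates left behind by OneHotEncoding are skipped via IsHidden.
string[] featureColumns = dataView.Schema
    .Where(column => !column.IsHidden && column.Name != "Label")
    .Select(column => column.Name)
    .ToArray();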

var categoricalEstimator = mlContext.Transforms.Categorical.OneHotEncoding("BusinessTravel");
dataView = categoricalEstimator.Fit(dataView).Transform(dataView);

DataOperationsCatalog.TrainTestData dataSplit = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.2);
IDataView trainData = dataSplit.TrainSet;
IDataView testData = dataSplit.TestSet;

var pipeline = mlContext.Transforms.Concatenate("Features", featureColumns)
    .Append(mlContext.Transforms.NormalizeMinMax("Features"))
    .Append(mlContext.BinaryClassification.Trainers.SdcaLogisticRegression());

var model = pipeline.Fit(trainData);
var transformedData = model.Transform(trainData);
var linearPredictor = model.LastTransformer;

var permutationMetrics = mlContext.BinaryClassification.PermutationFeatureImportance(linearPredictor, transformedData, permutationCount: 30);

var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.AreaUnderRocCurve })
                .OrderByDescending(feature => Math.Abs(feature.AreaUnderRocCurve.Mean))
                .Select(feature => feature.index);


var sb = new System.Text.StringBuilder();

// Calculate metrics of the model on the test data.
var trainedModelMetrics = mlContext.BinaryClassification.Evaluate(model.Transform(testData), labelColumnName: "Label");

sb.Append("<h1>Binary Classification Model, Predicting Employee Turnover</h1>");
sb.Append(String.Format("<h3>Accuracy:{0}</h3>", trainedModelMetrics.Accuracy));
sb.Append(String.Format("<h3>F1Score:{0}</h3>", trainedModelMetrics.F1Score));

sb.Append("<table border=1><thead><tr><th>Feature</th><th>Model Weight</th><th>Change in AUC</th><th>95% Confidence in the Mean Change in AUC</th></tr></thead><tbody>");
var auc = permutationMetrics.Select(x => x.AreaUnderRocCurve).ToArray();

foreach (int i in sortedIndices)
{
    if (transformedData.Schema[i].IsHidden || transformedData.Schema[i].Name == "Label" || transformedData.Schema[i].Name == "Features")
    {
        continue;
    }

    var s = String.Format("<tr><td>{0}</td><td>{1:0.00}</td><td>{2:G4}</td><td>{3:G4}</td></tr>",
        transformedData.Schema[i].Name,
        linearPredictor.Model.SubModel.Weights[i],
        auc[i].Mean,
        1.96 * auc[i].StandardError);
    sb.Append(s);
}
sb.Append("</tbody></table>");

return Content(sb.ToString());