Simple sentiment analysis using ml.net and IEnumerable dataview

Question

I was testing the simplest possible example to learn sentiment analysis, using a custom IEnumerable-backed data view instead of the traditional approach of loading data from text files. I created lists of test data and training data containing a few example reviews, following the samples available on GitHub and in the documentation. But something is missing, and the model I created is not working correctly: it gives the wrong result, predicting positive for everything.

Main

// File the trained model is written to by training and read back from by prediction.
private static string ModelPath = @"C:\ML\SentimentModel.zip";

/// <summary>
/// Entry point: creates a seeded ML context, trains and saves the model from the
/// in-memory samples, then runs the saved model over the test samples.
/// </summary>
void Main()
{
    MLContext context = new MLContext(seed: 1);
    List<SentimentData> trainingSet = GetTrainingData();
    List<SentimentData> testSet = GetTestData();

    // The returned transformer is not needed here; TestPrediction reloads the model from disk.
    BuildTrainEvaluateAndSaveModel(context, trainingSet, testSet);
    TestPrediction(context);
}

Testing and Training

/// <summary>
/// Featurizes the review text, trains a FastTree binary classifier on the training
/// samples, evaluates it on the test samples, and saves the model to <see cref="ModelPath"/>.
/// </summary>
/// <param name="mlContext">ML.NET context used for data loading, training, evaluation and saving.</param>
/// <param name="trainingData">Labelled samples the model is fitted on.</param>
/// <param name="testData">Labelled samples held out for evaluation.</param>
/// <returns>The trained model.</returns>
private static ITransformer BuildTrainEvaluateAndSaveModel(MLContext mlContext, List<SentimentData> trainingData, List<SentimentData> testData)
{
    // STEP 1: Common data loading configuration
    IDataView trainingDataView = mlContext.Data.ReadFromEnumerable(trainingData);
    // The test view must wrap testData; it was previously built from trainingData,
    // which silently ignored the testData parameter.
    IDataView testDataView = mlContext.Data.ReadFromEnumerable(testData);

    // STEP 2: Common data process configuration with pipeline data transformations
    var dataProcessPipeline = mlContext.Transforms.Text.FeaturizeText(outputColumnName: DefaultColumnNames.Features, inputColumnName: nameof(SentimentData.Text));

    // STEP 3: Set the training algorithm, then create and config the modelBuilder
    // NOTE(review): FastTree's default minimum number of examples per leaf is presumably
    // larger than a handful of rows; on a tiny training set no split may be learned and
    // every prediction comes out identical. Confirm and lower it when training on toy data.
    var trainer = mlContext.BinaryClassification.Trainers.FastTree(labelColumn: DefaultColumnNames.Label, featureColumn: DefaultColumnNames.Features);
    var trainingPipeline = dataProcessPipeline.Append(trainer);

    // STEP 4: Train the model fitting to the DataSet
    Console.WriteLine("=============== Training the model ===============");
    ITransformer trainedModel = trainingPipeline.Fit(trainingDataView);

    // STEP 5: Evaluate the model against the held-out test data
    Console.WriteLine("=============== Evaluating the model on the test set ===============");
    IDataView predictions = trainedModel.Transform(testDataView);
    var metrics = mlContext.BinaryClassification.Evaluate(predictions, DefaultColumnNames.Label);
    Console.WriteLine("Accuracy: {0:P2} | F1 score: {1:P2}", metrics.Accuracy, metrics.F1Score);

    // STEP 6: Save/persist the trained model to a .ZIP file
    using (var fs = new FileStream(ModelPath, FileMode.Create, FileAccess.Write, FileShare.Write))
    {
        mlContext.Model.Save(trainedModel, fs);
    }

    Console.WriteLine("The model is saved to {0}", ModelPath);

    return trainedModel;
}

/// <summary>
/// Loads the saved model from <see cref="ModelPath"/> and prints, for every test sample,
/// the predicted sentiment next to the expected one.
/// </summary>
/// <param name="mlContext">ML.NET context used to load the model and create the prediction engine.</param>
private void TestPrediction(MLContext mlContext)
{
    var testData = GetTestData();
    ITransformer trainedModel;
    using (var stream = new FileStream(ModelPath, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        trainedModel = mlContext.Model.Load(stream);
    }
    var engine = trainedModel.CreatePredictionEngine<SentimentData, SentimentPrediction>(mlContext);
    foreach (var test in testData)
    {
        var result = engine.Predict(test);
        // The training samples use Label = true for positive reviews, so a true prediction
        // means "Positive". The mapping was previously inverted (and misspelled "Postive").
        var predicted = result.Prediction ? "Positive" : "Negative";
        Console.WriteLine($"Prediction : {predicted} | Actual: {test.Expected} | Text : {test.Text}");
    }
}

Models and Training/Test data

/// <summary>
/// Builds the in-memory training set: four reviews labelled true followed by
/// three reviews labelled false.
/// </summary>
/// <returns>A new list of labelled training samples.</returns>
public List<SentimentData> GetTrainingData()
{
    var samples = new List<SentimentData>();

    // Reviews labelled true.
    samples.Add(new SentimentData { Label = true, Text = "Good service." });
    samples.Add(new SentimentData { Label = true, Text = "Very good service" });
    samples.Add(new SentimentData { Label = true, Text = "Amazing service" });
    samples.Add(new SentimentData { Label = true, Text = "Great staff, will visit again. thanks for the gift" });

    // Reviews labelled false.
    samples.Add(new SentimentData { Label = false, Text = "Bad staff, bad service. Will never visit this hotel" });
    samples.Add(new SentimentData { Label = false, Text = "The service was very bad" });
    samples.Add(new SentimentData { Label = false, Text = "Hotel location is worst" });

    return samples;
}

/// <summary>
/// Builds the in-memory test set. Each sample carries the ground-truth
/// <c>Label</c> and a matching human-readable <c>Expected</c> value.
/// </summary>
/// <returns>A new list of labelled test samples.</returns>
public List<SentimentData> GetTestData()
{
    // Label must agree with Expected (true = positive, false = negative). Previously every
    // sample was labelled true, which leaves the test set with a single class.
    return new List<SentimentData>
    {
        new SentimentData
        {
            Label = false,
            Text = "Worst hotel in New York",
            Expected = "Negative"
        },
        new SentimentData
        {
            Label = false,
            Text = "I ordered pizza and recieved Wine. Bad staff",
            Expected = "Negative"
        },
        new SentimentData
        {
            Label = true,
            Text = "The hotel was so amazing, and they givena bag to me on gift",
            Expected = "Positive"
        },
        new SentimentData
        {
            Label = true,
            Text = "The hotel staff was great, will visit again",
            Expected = "Positive"
        }
    };
}

/// <summary>
/// One review sample: the text to classify plus its sentiment label.
/// </summary>
public class SentimentData
{
    // Sentiment label of the sample; the training samples in this file use true for positive reviews.
    public bool Label { get; set; }
    // Raw review text; this is the property the pipeline featurizes.
    public string Text { get; set; }

    // Additional property for testing purpose
    // Human-readable expected sentiment, printed next to the prediction; set only on test samples.
    public string Expected {get; set;}
}

/// <summary>
/// Output of the prediction engine for a single <c>SentimentData</c> input.
/// </summary>
public class SentimentPrediction
{
    // Bound by the attribute to the "PredictedLabel" column rather than to the property name.
    [ColumnName("PredictedLabel")]
    public bool Prediction { get; set; }
    // NOTE(review): presumably filled from the model's output column of the same name — confirm.
    public float Probability { get; set; }
    // NOTE(review): presumably the raw model score from the column of the same name — confirm.
    public float Score { get; set; }
}

how big is your dataset? if it is not big enough, you might not get a good model. — amy8374, Feb 08 '19 at 18:33
The dataset is not the problem. I checked with 10, 1,000 and 100k samples. Something is off in the training configuration, I guess, because I am getting AUC: NaN. I just added a screenshot of the metrics for your reference. — Vikash Rathee, Feb 09 '19 at 01:24

Simple sentiment analysis using ml.net and IEnumerable dataview

0 Answers0