1

I have a test program that does not give consistent results for Accord.Net K-Means.

I am enclosing a reproducible test program that can be run in Visual Studio 2013.

The program is a console application and to reproduce the results you need to reference:

Accord.MachineLearning
Accord.Statistics

from the Accord.Net 2.15 library.

When I run the program several times I get different results each time. The program uses the classic Fisher Iris dataset. The dataset has 150 rows, and I split the data into 120 rows of training data and 30 rows of testing data.

When I run the program I might get 26 out of 30 classified correctly. Running it again may produce 2 out of 30 correct.

For example:

 Number correct: 2 out of 30
         FScore: NaN
      Precision: 0
 True Positives: 0
False Positives: 9
 True Negatives: 9
False Negatives: 12
       Accuracy: 0.3
 Standard Error: 0.107268513868515
       Variance: 0.0115065340675597

I am wondering if I am correctly using Accord.Net. Any help will be greatly appreciated.

My program is:

using System;
using System.IO;
using System.Net;

using Accord.MachineLearning;
using Accord.Statistics.Analysis;

namespace K_Keans {

  #region K_Means
  /// <summary>
  /// Downloads the Fisher iris dataset, splits it into training/testing sets,
  /// clusters the training set with k-means, and labels new points by their
  /// nearest cluster. NOTE(review): k-means cluster indices are arbitrary —
  /// cluster 0 is not guaranteed to correspond to class label 0, so comparing
  /// predicted cluster indices directly against class labels is unreliable.
  /// </summary>
  public static class K_Means {
    // Trained model; set by Train() and read by Test().
    private static KMeans kmeans;

    #region DownloadIrisData
    /// <summary>
    /// Downloads the Fisher iris CSV and parses it into a predictor matrix
    /// and a target-label vector.
    /// </summary>
    /// <param name="predictors">One row per iris; the measurement columns.</param>
    /// <param name="targets">Class label from column 0 of each row.</param>
    private static void DownloadIrisData(out double[][] predictors, out int[] targets) {
      using (var fileDownloader = new WebClient()) {
        // http://www.math.uah.edu/stat/data/Fisher.html
        // The dataset gives Ronald Fisher's measurements of type, petal width (PW),
        // petal length (PL), sepal width (SW), and sepal length (SL) for a sample
        // of 150 irises, measured in millimeters.
        // Type 0 is Setosa; type 1 is Virginica; and type 2 is Versicolor.
        const string webLocation = @"http://www.math.uah.edu/stat/data/Fisher.csv";
        // Use the system temp directory instead of a hard-coded c:\Temp,
        // which may not exist on every machine.
        var fileName = Path.Combine(Path.GetTempPath(), "iris.csv");
        fileDownloader.DownloadFile(webLocation, fileName);

        // ReadAllLines handles \r\n correctly; the original ReadAllText +
        // Split('\n') left a trailing '\r' on the last field of each row and
        // assumed exactly one trailing newline ("Length - 2").
        var lines = File.ReadAllLines(fileName);
        var ncols = lines[0].Split(',').Length;

        // First pass: count non-empty data rows (skip the header at index 0).
        var nrows = 0;
        for (var j = 1; j < lines.Length; j++) {
          if (lines[j].Trim().Length > 0) { nrows++; }
        }

        predictors = new double[nrows][];
        targets = new int[nrows];

        // Second pass: parse each data row.
        var row = 0;
        for (var j = 1; j < lines.Length; j++) {
          if (lines[j].Trim().Length == 0) { continue; }
          var fields = lines[j].Split(',');
          // Label lives in column 0; assign it once per row (the original
          // re-assigned it inside the inner column loop).
          targets[row] = Convert.ToInt32(fields[0]);
          predictors[row] = new double[ncols - 1];
          for (var k = 1; k < ncols; k++) {
            predictors[row][k - 1] = Convert.ToDouble(fields[k]);
          }
          row++;
        }
      }
    }
    #endregion

    #region IrisData
    /// <summary>
    /// Splits the downloaded data into an 80% training set and a 20% testing set.
    /// NOTE(review): the split is NOT shuffled — the file's rows appear to be
    /// ordered, so the held-out tail may over-represent a single species;
    /// consider randomizing the row order before splitting.
    /// </summary>
    public static void IrisData(out double[][] trainingData, out int[] expectedTrainingTargets,
                                out double[][] testingData, out int[] expectedTestingTargets) {
      double[][] predictors;
      int[] targets;
      DownloadIrisData(out predictors, out targets);

      var nRows = predictors.Length;
      var nCols = predictors[0].Length;
      var nRowsTesting = Convert.ToInt32(0.2 * nRows);
      var nRowsTraining = nRows - nRowsTesting;

      // Training split: the first nRowsTraining rows.
      trainingData = new double[nRowsTraining][];
      expectedTrainingTargets = new int[nRowsTraining];
      for (var k = 0; k < nRowsTraining; k++) {
        trainingData[k] = new double[nCols];
        Array.Copy(predictors[k], trainingData[k], nCols);
        expectedTrainingTargets[k] = targets[k];
      }

      // Testing split: the remaining tail of the file.
      testingData = new double[nRowsTesting][];
      expectedTestingTargets = new int[nRowsTesting];
      for (var k = 0; k < nRowsTesting; k++) {
        testingData[k] = new double[nCols];
        Array.Copy(predictors[nRows - nRowsTesting + k], testingData[k], nCols);
        expectedTestingTargets[k] = targets[nRows - nRowsTesting + k];
      }
    }
    #endregion

    #region Train
    /// <summary>
    /// Runs k-means (k = 3) over the training data and returns each row's
    /// assigned cluster index. K-means starts from random centroids, so both
    /// the clustering and the index-to-cluster mapping vary between runs —
    /// this is why repeated executions report different "accuracy" values.
    /// </summary>
    public static void Train(double[][] trainingData, out int[] predicted) {

      kmeans = new KMeans(3) {
        Tolerance = 1e-5,
        ComputeInformation = true
      };

      // Returned values are cluster indices (0..2), not species labels.
      predicted = kmeans.Compute(trainingData);
    }
    #endregion

    #region Test
    /// <summary>
    /// Assigns each testing row to its nearest learned cluster.
    /// Must be called after Train(), which initializes the model.
    /// </summary>
    public static void Test(double[][] testingData, out int[] predicted) {
      var nRowsTesting = testingData.Length;
      predicted = new int[nRowsTesting];
      for (var k = 0; k < nRowsTesting; k++) {
        predicted[k] = kmeans.Clusters.Nearest(testingData[k]);
      }
    }
    #endregion
  }
  #endregion

  class Program {
    // Entry point: fetch the iris data, cluster the training split with
    // k-means, label the held-out split, and print accuracy statistics.
    static void Main(string[] args) {
      double[][] trainingData, testingData;
      int[] expectedTrainingTargets, expectedTestingTargets;

      K_Means.IrisData(out trainingData, out expectedTrainingTargets,
                       out testingData, out expectedTestingTargets);

      int[] predictedTrainingTargets;
      K_Means.Train(trainingData, out predictedTrainingTargets);

      int[] predictedTestingTargets;
      K_Means.Test(testingData, out predictedTestingTargets);

      // Count exact matches between predicted cluster indices and labels.
      var total = expectedTestingTargets.Length;
      var hits = 0;
      for (var i = 0; i < total; i++) {
        if (expectedTestingTargets[i] == predictedTestingTargets[i]) {
          hits++;
        }
      }

      var confusionMatrix = new ConfusionMatrix(predictedTestingTargets, expectedTestingTargets);

      Console.WriteLine(" Number correct: {0} out of {1}", hits, total);
      Console.WriteLine("         FScore: {0}", confusionMatrix.FScore);
      Console.WriteLine("      Precision: {0}", confusionMatrix.Precision);
      Console.WriteLine(" True Positives: {0}", confusionMatrix.TruePositives);
      Console.WriteLine("False Positives: {0}", confusionMatrix.FalsePositives);
      Console.WriteLine(" True Negatives: {0}", confusionMatrix.TrueNegatives);
      Console.WriteLine("False Negatives: {0}", confusionMatrix.FalseNegatives);
      Console.WriteLine("       Accuracy: {0}", confusionMatrix.Accuracy);
      Console.WriteLine(" Standard Error: {0}", confusionMatrix.StandardError);
      Console.WriteLine("       Variance: {0}", confusionMatrix.Variance);
      Console.WriteLine(" ");
      Console.WriteLine("Hit enter to exit.");
      Console.ReadKey();
    }
  }
}
Alexander Schmidt
  • 5,631
  • 4
  • 39
  • 79
CBrauer
  • 1,035
  • 2
  • 11
  • 17

1 Answer

3

K-means is not a classification algorithm.

But it is a randomized algorithm, so it's no surprise that you get different results every time.

Now since it is randomized the labels used by k-means are random, too.

So 2 out of 30 correct may be the same as 28 out of 30 correct (just the labels shuffled).

Run it again, and it may yield the same clusters, but with the "labels" all mixed up. (In fact, it doesn't know about iris species. It labels objects 0,1,2; not "iris setosa")

Has QUIT--Anony-Mousse
  • 76,138
  • 12
  • 138
  • 194