I'm trying to use OpenCV 3.1's NormalBayesClassifier on a simple problem that I can easily generate training data for. I settled on classifying input numbers as even or odd. Obviously this can be computed directly with 100% accuracy, but the point is to exercise the ML capabilities of OpenCV in order to get familiar with them.
So, my first question is: is there a theoretical reason why NormalBayesClassifier wouldn't be an appropriate model for this problem?
If not, my second question is: why is my error rate so high? cv::ml::StatModel::calcError() is giving me values between 30% and 70%.
Third, what's the best way to bring the error rate down?
Here's a minimal, self-contained snippet that demonstrates the issue. (To be clear, the classification output should be 0 for an even number and 1 for an odd number.)
#include <opencv2/ml.hpp>
#include <iostream>
#include <iomanip>
#include <ctime>

int main() {
    const int numSamples = 1000;
    cv::RNG rng((uint64) time(NULL));

    // construct training sample data: a single feature per row, the raw number as a float
    cv::Mat samples;
    samples.create(numSamples, 1, CV_32FC1);
    for (int i = 0; i < numSamples; i++) {
        samples.at<float>(i) = (float)(int) rng(10000);
    }

    // construct training response data: 0 for even, 1 for odd
    cv::Mat responses;
    responses.create(numSamples, 1, CV_32SC1);
    for (int i = 0; i < numSamples; i++) {
        int sample = (int) samples.at<float>(i);
        responses.at<int>(i) = sample % 2;
    }

    // train on 90% of the samples, keep 10% as a test split
    cv::Ptr<cv::ml::TrainData> data = cv::ml::TrainData::create(samples, cv::ml::ROW_SAMPLE, responses);
    data->setTrainTestSplitRatio(.9);

    cv::Ptr<cv::ml::NormalBayesClassifier> classifier = cv::ml::NormalBayesClassifier::create();
    classifier->train(data);

    // classification error (in percent) on the held-out test split
    float errorRate = classifier->calcError(data, true, cv::noArray());
    std::cout << "Bayes error rate: [" << errorRate << "]" << std::endl;

    // construct prediction inputs
    const int numPredictions = 10;
    cv::Mat predictInputs;
    predictInputs.create(numPredictions, 1, CV_32FC1);
    for (int i = 0; i < numPredictions; i++) {
        predictInputs.at<float>(i) = (float)(int) rng(10000);
    }
    cv::Mat predictOutputs;
    predictOutputs.create(numPredictions, 1, CV_32SC1);

    // run prediction
    classifier->predict(predictInputs, predictOutputs);

    // compare predictions against the true parity
    int numCorrect = 0;
    for (int i = 0; i < numPredictions; i++) {
        int input = (int) predictInputs.at<float>(i);
        int output = predictOutputs.at<int>(i);
        bool correct = (input % 2 == output);
        if (correct)
            numCorrect++;
        std::cout << "Input = [" << input << "], "
                  << "predicted output = [" << output << "], "
                  << "correct = [" << (correct ? "yes" : "no") << "]" << std::endl;
    }

    float percentCorrect = (float) numCorrect / numPredictions * 100.0f;
    std::cout << "Percent correct = [" << std::fixed << std::setprecision(0) << percentCorrect << "]" << std::endl;
}
Sample run output:
Bayes error rate: [36]
Input = [9150], predicted output = [1], correct = [no]
Input = [3829], predicted output = [0], correct = [no]
Input = [4985], predicted output = [0], correct = [no]
Input = [8113], predicted output = [1], correct = [yes]
Input = [7175], predicted output = [0], correct = [no]
Input = [811], predicted output = [1], correct = [yes]
Input = [699], predicted output = [1], correct = [yes]
Input = [7955], predicted output = [1], correct = [yes]
Input = [8282], predicted output = [1], correct = [no]
Input = [1818], predicted output = [0], correct = [yes]
Percent correct = [50]
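In case it helps frame my first question: my understanding is that NormalBayesClassifier models the feature values of each class as a (roughly) Gaussian distribution. Below is a rough diagnostic sketch (untested, just illustrative) of how I would compare the per-class distribution of my single feature using cv::meanStdDev.

// Diagnostic sketch only (untested): compare the distribution of the raw-number
// feature for even vs. odd samples, since NormalBayesClassifier (as I understand it)
// fits a Gaussian per class over the input features.
#include <opencv2/core.hpp>
#include <iostream>
#include <ctime>

int main() {
    const int numSamples = 1000;
    cv::RNG rng((uint64) time(NULL));

    // split the same kind of training feature (the raw number) by class
    cv::Mat evens, odds;
    for (int i = 0; i < numSamples; i++) {
        int value = (int) rng(10000);
        if (value % 2 == 0)
            evens.push_back((float) value);
        else
            odds.push_back((float) value);
    }

    // per-class mean and standard deviation of the feature
    cv::Scalar meanEven, stdEven, meanOdd, stdOdd;
    cv::meanStdDev(evens, meanEven, stdEven);
    cv::meanStdDev(odds, meanOdd, stdOdd);

    std::cout << "even: mean = " << meanEven[0] << ", stddev = " << stdEven[0] << std::endl;
    std::cout << "odd:  mean = " << meanOdd[0]  << ", stddev = " << stdOdd[0]  << std::endl;
}

If the two per-class distributions come out essentially identical, I suspect that's the answer to question 1, but I'd appreciate confirmation and advice on question 3.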