I have been trying to code the Bayes Classifier for a few days in Java. I downlaoded the Iris-Dataset (https://www.kaggle.com/uciml/iris) for this. I wrote the code for loading the dataset (I converted the .csv file to .txt). It works fine. Then for it is a Bayes Classifier, I needed bucketized data. I came up with a logic for that, implemented it and it works fine. I made the required changes to my code. But when I ran the code again, it prints just -1 for all the values. When I tested my bucketing logic again, it was working fine! Please help. My bucketing logic returns the number of the bucket in which the given value would go.
Code for loading the dataset -
public class BayesianClassifier
{
//------------------working variables---------------------
private static OrderedMap<int [], String> dataset = loadDataset(); //method for loading the dataset. refer below.
private static int maxBuckets = 5; //these value are
private static double bucketSize = 1.0;//just for testing
private static double min = 4.0; //whether my bucketing logic
private static double max = 7.0; //works fine on the data
//------------------working methods-----------------------
private static OrderedMap<int [], String> loadDataset() {
OrderedMap<int [], String> dataset = new OrderedMap<int [], String>(); //I needed a map structure which stores entries in the same
// order as they are put. I coded it myself because there are
// no classes like that in Java.
try {
BufferedReader reader = new BufferedReader(new FileReader("F:\\File Transport Directory\\Bayesian\\Iris - Copy.txt"));
String line = reader.readLine(); //I skip the first line in the file because I do not need it.
String [] fullRow = line.split("\t"); //for initializing the size of a record.
int [] neededRow = new int[fullRow.length - 2]; //I do not need the first value (ID) and the last value(category).
// I will store category as the value in my OrderedMap.
while((line = reader.readLine()) != null) {
fullRow = line.split("\t");
for(int i = 1 ; i < fullRow.length - 1 ; i++) {
double value = Double.parseDouble(fullRow[i]); //parse the value as double because it is in String format
neededRow[i - 1] = mapToBucket(value, bucketSize, min, max, maxBuckets); //bucketize
}
String category = fullRow[fullRow.length - 1]; //category name to be stored
Entry<int [], String> toPut = new Entry<int [], String>(neededRow, category); //I also coded an Entry class because
// in Java it is an interface.
dataset.put(toPut);
}
reader.close();
} catch (Exception e) {
e.printStackTrace();
}
return dataset;
}
private static void printDataset() {
Entry<int [], String> en;
for(int i = 0 ; i < dataset.size() ; i++) {
en = dataset.entryAt(i);
Arr.printArrLine(en.key());
System.out.println("class : " + en.value() + "\n");
}
}
//------------------main-----------------------------------
public static void main(String [] args) {
printDataset(); //I print the dataset
}
}
Code for bucketing -
private static int mapToBucket(double value, double bucketSize, double min, double max, int maxBuckets) {
int noLimitBucket = (int) ((int)Math.ceil(value / bucketSize) - min); //The logic I came up with for bucketing
if(value < min) return 0;
else if(value > max) return maxBuckets - 1;
else if(noLimitBucket > maxBuckets - 1) return maxBuckets - 1;
else if(noLimitBucket < 0) return 0;
else return noLimitBucket;
}
OrderedMap Class -
public class OrderedMap<K, V>
{
private Vector<Entry<K, V>> dataset = new Vector<Entry<K, V>>();
private HashSet<Integer> keyHashCodes = new HashSet<Integer>();
private HashSet<Integer> valueHashCodes = new HashSet<Integer>();
public void put(Entry<K, V> toPut) {
if(containsKey(toPut.key())) {
keyHashCodes.remove(toPut.hashCode());
for(int i = 0 ; i < dataset.size() ; i++) {
if(dataset.elementAt(i).equals(toPut)) {
dataset.remove(i);
break;
}
}
}
dataset.add(toPut);
keyHashCodes.add(toPut.key().hashCode());
valueHashCodes.add(toPut.value().hashCode());
}
public V remove(K key) {
V value = null;
if(containsKey(key)) {
for(int i = 0 ; i < dataset.size() ; i++) {
if(dataset.elementAt(i).key().equals(key)) {
value = dataset.elementAt(i).value();
dataset.remove(i);
break;
}
}
}
return value;
}
public Entry<K, V> entryAt(int index) {
return dataset.elementAt(index);
}
public boolean containsKey(K key) {
if(keyHashCodes.contains(key.hashCode())) return true;
else return false;
}
public boolean containsValue(V value) {
if(valueHashCodes.contains(value.hashCode())) return true;
else return false;
}
public int size() {
return dataset.size();
}
}
Entry Class -
public class Entry<K, V>
{
private K key;
private V value;
Entry(K k, V v) {
key = k;
value = v;
}
public K key() {
return key;
}
public V value() {
return value;
}
}