import pandas as pd
import numpy
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
fi = "df.csv"
# Open the file for reading and read in data
file_handler = open(fi, "r")
data = pd.read_csv(file_handler, sep=",")
file_handler.close()
# split the data into training and test data
train, test = cross_validation.train_test_split(data,test_size=0.6, random_state=0)
# initialise Gaussian Naive Bayes
naive_b = GaussianNB()
train_features = train.ix[:,0:127]
train_label = train.iloc[:,127]
test_features = test.ix[:,0:127]
test_label = test.iloc[:,127]
naive_b.fit(train_features, train_label)
test_data = pd.concat([test_features, test_label], axis=1)
test_data["p_malw"] = naive_b.predict_proba(test_features)
print "test_data\n",test_data["p_malw"]
print "Accuracy:", naive_b.score(test_features,test_label)
I have written this code to accept input from a csv file with 128 columns where 127 columns are features and the 128th column is the class label.
I want to predict probability that the sample belongs to each class (There are 5 classes (1-5)) and print it in for of a matrix and determine the class of sample based on the prediction. predict_proba() is not giving the desired output. Please suggest required changes.