I am trying to implement an online classifier using the PassiveAggressiveClassifier in scikit-learn with the 20 newsgroups dataset. I am very new to this, so I am not sure whether I have implemented it properly. That being said, I developed a small piece of code, but when I execute it I keep getting this error:
Traceback (most recent call last):
  File "/home/suleka/Documents/RNN models/passiveagressive.py", line 100, in <module>
    clf.fit(X, y)
  File "/home/suleka/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/passive_aggressive.py", line 225, in fit
    coef_init=coef_init, intercept_init=intercept_init)
  File "/home/suleka/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py", line 444, in _fit
    classes, sample_weight, coef_init, intercept_init)
  File "/home/suleka/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py", line 407, in _partial_fit
    raise ValueError("The number of class labels must be "
ValueError: The number of class labels must be greater than one.
I checked most of the related posts on Stack Overflow, and they suggested that this error comes up when there is only one unique class in the labels. So I ran np.unique(labels)
and it showed all 20 classes (the 20 newsgroups):
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19]
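My current guess (I am not sure if this is the actual cause) is that, because I slice out one row at a time inside the loop, each call to fit only ever sees a single label, even though the full target array has 20 classes. A quick check that seems to point in that direction (the one-row slice mirrors what my loop passes to fit):

import numpy as np
from sklearn.datasets import fetch_20newsgroups

# same target array as in the code below
labels = fetch_20newsgroups(subset='all').target

print(np.unique(labels).size)       # 20 classes in the whole dataset
print(np.unique(labels[0:1]).size)  # 1 -- a one-row slice only ever holds one label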
Can anyone help me out with this error, and please let me know if I have implemented it incorrectly?
My code is shown below:
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.datasets import make_classification
from string import punctuation
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from collections import Counter
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer
from sklearn.utils import shuffle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
seed = 42
np.random.seed(seed)
def preProcess():
    newsgroups_data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')

    features = vectorizer.fit_transform(newsgroups_data.data)
    labels = newsgroups_data.target

    return features, labels
if __name__ == '__main__':

    features, labels = preProcess()

    X_train, y_train = shuffle(features, labels, random_state=seed)

    clf = PassiveAggressiveClassifier(random_state=seed)

    n, d = X_train.shape

    print(np.unique(labels))

    error = 0
    iteration = 0

    for i in range(n):
        print(iteration)

        X, y = X_train[i:i + 1], y_train[i:i + 1]

        clf.fit(X, y)
        pred = clf.predict(X)

        print(pred)
        print(y)

        if y - pred != 0:
            error += 1

        iteration += iteration

    print(error)
    print(np.divide(error, n, dtype=np.float))
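While reading the scikit-learn documentation I also came across partial_fit, which I believe accepts a classes argument so the model is told about all 20 labels up front. I am not sure whether swapping fit for something like the sketch below (reusing labels, X_train, y_train, n and clf from the code above) is the right way to do online learning, so please correct me if this is wrong:

all_classes = np.unique(labels)

for i in range(n):
    X, y = X_train[i:i + 1], y_train[i:i + 1]

    # partial_fit instead of fit; classes is needed (at least) on the first call
    clf.partial_fit(X, y, classes=all_classes)
    pred = clf.predict(X)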
Thank you in advance!