I have trained a model using CountVectorizer, TfidfTransformer, and SGDClassifier.
This is the tokenizer part:
from keras.preprocessing.text import Tokenizer
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(master_df['Observation'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
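For context, MAX_SEQUENCE_LENGTH and EMBEDDING_DIM are defined above but not used anywhere in the code that follows; they would normally be used to turn the fitted tokenizer into padded integer sequences for a Keras model, roughly like this sketch (not part of my failing pipeline):

from keras.preprocessing.sequence import pad_sequences

# Sketch: map raw texts to integer ID sequences with the fitted tokenizer,
# then pad/truncate every sequence to the same fixed length.
sequences = tokenizer.texts_to_sequences(master_df['Observation'].values)
padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(padded.shape)  # (number_of_documents, MAX_SEQUENCE_LENGTH)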
Then I trained the model:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Bag-of-n-grams features, fitted on the full corpus.
cv = CountVectorizer(max_df=1.0, min_df=1, stop_words=stop_words, max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
sgd = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),
])
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=my_tags))
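As an aside, my understanding is that the CountVectorizer + TfidfTransformer pair does the same job as a single TfidfVectorizer step, so the whole raw-text-to-features conversion could live inside the pipeline itself. A sketch of that equivalent setup (reusing stop_words from above):

from sklearn.feature_extraction.text import TfidfVectorizer

# Equivalent sketch: TfidfVectorizer = CountVectorizer + TfidfTransformer,
# so this pipeline would accept raw text directly.
sgd_alt = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words, max_features=10000, ngram_range=(1, 3))),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),
])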
This part works fine. But when I try to use this model to predict on a new sentence with the following code:
sentence="Drill was not in operation in the mine at the time of visit."
test=preprocess_text(sentence)
test=test.lower()
print(test)
test=[test]
tokenizer.fit_on_texts(test)
word_index = tokenizer.word_index
#print(word_index)
test1=cv.transform(test)
print(test1)
output=sgd.predict(test1)
output
It gives me this error:
ValueError: Input has n_features=12 while the model has been trained with n_features=2494
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_18044/596445027.py in <module>
9 test1=cv.fit_transform(test)
10 print(test1)
---> 11 output=sgd.predict(test1)
12 output
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
118
119 # lambda, but not partial, allows help() to work with update_wrapper
--> 120 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
121 # update the docstring of the returned function
122 update_wrapper(out, self.fn)
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\pipeline.py in predict(self, X, **predict_params)
416 Xt = X
417 for _, name, transform in self._iter(with_final=False):
--> 418 Xt = transform.transform(Xt)
419 return self.steps[-1][-1].predict(Xt, **predict_params)
420
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\feature_extraction\text.py in transform(self, X, copy)
1491 expected_n_features = self._idf_diag.shape[0]
1492 if n_features != expected_n_features:
-> 1493 raise ValueError("Input has n_features=%d while the model"
1494 " has been trained with n_features=%d" % (
1495 n_features, expected_n_features))
ValueError: Input has n_features=12 while the model has been trained with n_features=2494
I think the problem lies in the word_index = tokenizer.word_index line, but I don't know how to rectify it.
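For what it's worth, the n_features=12 in the error looks consistent with a vectorizer that was refit on the single test sentence rather than reusing the training vocabulary. A standalone probe of that idea (the five-token text below is only my guess at what preprocess_text returns for the sentence above):

from sklearn.feature_extraction.text import CountVectorizer

# Fitting a *fresh* vectorizer on one short text only learns that text's
# n-grams: 5 unigrams + 4 bigrams + 3 trigrams = 12 features.
probe = CountVectorizer(ngram_range=(1, 3))
X_probe = probe.fit_transform(["drill operation mine time visit"])
print(X_probe.shape)  # (1, 12) -- nowhere near the 2494 features from training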