import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
# Train and evaluate a basic bag-of-words Naive Bayes classifier on the
# sarcasm dataset: load CSV -> split -> vectorize text -> fit -> predict.

# Load the dataset and give the "Unnamed: 0" column a usable name.
s_df = pd.read_csv('Sarcasm Dataset.csv')
s_df.rename({"Unnamed: 0": "number"}, axis="columns", inplace=True)

# Drop the columns this classifier does not use; only 'tweet' (input text)
# and 'sarcastic' (target label) are read below.
sarc_classify = s_df.drop(
    [
        'number',
        'sarcasm',
        'irony',
        'satire',
        'understatement',
        'overstatement',
        'rhetorical_question',
    ],
    axis=1,
)

# NOTE: no random_state is passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    sarc_classify['tweet'], sarc_classify['sarcastic']
)

vectorizer = CountVectorizer()

# Learn the vocabulary from the TRAINING text only. astype('U') coerces every
# entry (including missing values) to a unicode string for the vectorizer.
X1 = vectorizer.fit_transform(X_train.values.astype('U'))
# GaussianNB needs a dense array, so expand the sparse count matrix.
X_train = X1.toarray()

# Encode the test text with the vocabulary learned above: transform(), NOT
# fit_transform(). Re-fitting on the test split builds a different vocabulary,
# so the column count no longer matches what the model was trained on and
# predict() raises "X has N features, but GaussianNB is expecting M features".
X2 = vectorizer.transform(X_test.values.astype('U'))
X_test = X2.toarray()

gnb = GaussianNB()
naive_bayes = gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
I am getting the error below. The X_train and y_train values look like this before vectorizing. All I want is to implement a basic Naive Bayes classifier using scikit-learn.
Error:
ValueError Traceback (most recent call last) <ipython-input-243-52354d6c7ca6> in <module>()
1 gnb = GaussianNB()
2 naive_bayes = gnb.fit(X_train, y_train)
----> 3 y_pred =gnb.predict(X_test)
4 acc_score = accuracy_score(y_test, y_pred)
5 print(acc_score)
3 frames /usr/local/lib/python3.7/dist-packages/sklearn/base.py in
_check_n_features(self, X, reset)
399 if n_features != self.n_features_in_:
400 raise ValueError(
--> 401 f"X has {n_features} features, but {self.__class__.__name__} "
402 f"is expecting {self.n_features_in_} features as input."
403 )
ValueError: X has 1549 features, but GaussianNB is expecting 3298 features as input.