I've written a program that takes a Twitter dataset containing tweets and labels (0 for neutral sentiment and 1 for negative sentiment) and predicts which category a tweet belongs to.
The program works well on the training and test sets. However, I'm having a problem applying the predict function to a single string, and I'm not sure how to do that.
I have tried cleaning the string the same way I cleaned the dataset before calling the predict function, but the values returned have the wrong shape.
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
import re
#Loading dataset
dataset = pd.read_csv('tweet.csv')

# Build the stop-word set once; rebuilding it for every token is needlessly slow.
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Normalise one raw tweet for the bag-of-words model.

    Used for both the training tweets and any new string passed to the
    classifier, so that the two always go through identical preprocessing.

    Args:
        text: Raw tweet text.

    Returns:
        The lower-cased, stop-word-free, stemmed tokens joined by single spaces.
    """
    # Strip @mentions first, while the '@' marker is still present, then feed
    # that result into the next substitution. (Previously the second re.sub
    # re-read the raw tweet, silently discarding the first substitution.)
    text = re.sub(r'@\w*', ' ', text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    tokens = text.lower().split()
    return ' '.join(ps.stem(token) for token in tokens if token not in stop_words)


#List to hold cleaned tweets
clean_tweet = [clean_text(raw_tweet) for raw_tweet in dataset['tweet']]

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 3000)
X = cv.fit_transform(clean_tweet)
# GaussianNB needs a dense array, not the sparse matrix CountVectorizer returns.
X = X.toarray()
# NOTE(review): assumes the labels live in the second column of tweet.csv - confirm.
y = dataset.iloc[:, 1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

from sklearn.naive_bayes import GaussianNB
n_b = GaussianNB()
n_b.fit(X_train, y_train)
y_pred = n_b.predict(X_test)

#Predicting a single, unseen string
some_tweet = "this is a mean tweet"
# Use transform(), not fit_transform(): the already-fitted vocabulary must be
# reused so the row has exactly the same columns the classifier was trained on.
# The vectorizer expects an iterable of documents, so the cleaned string is
# wrapped in a list; the result is one row, densified like the training data.
some_tweet_features = cv.transform([clean_text(some_tweet)]).toarray()
# predict() returns one label per row; some_tweet_pred[0] is the label for some_tweet.
some_tweet_pred = n_b.predict(some_tweet_features)