I am working on a Grading System ( graduation project ). I have preprocessed the data, then used TfidfVectorizer on the data and used LinearSVC to fit the model.
The System goes as follows, it has 265 definitions, of arbitrary lengths; but in total, they sum up to shape of (265, 8581 ) so when I try to input some new random sentence to predict against it, I get this message
you could have a look at the code used ( Full & long ) if you want to;
Code used;
def normalize(df):
lst = []
for x in range(len(df)):
text = re.sub(r"[,.'!?]",'', df[x])
lst.append(text)
filtered_sentence = ' '.join(lst)
return filtered_sentence
def stopWordRemove(df):
stop = stopwords.words("english")
needed_words = []
for x in range(len(df)):
words = word_tokenize(df)
for word in words:
if word not in stop:
needed_words.append(word)
return needed_words
def prepareDataSets(df):
sentences = []
for index, d in df.iterrows():
Definitions = stopWordRemove(d['Definitions'].lower())
Definitions_normalized = normalize(Definitions)
if d['Results'] == 'F':
sentences.append([Definitions, 'false'])
else:
sentences.append([Definitions, 'true'])
df_sentences = DataFrame(sentences, columns=['Definitions', 'Results'])
for x in range(len(df_sentences)):
df_sentences['Definitions'][x] = ' '.join(df_sentences['Definitions'][x])
return df_sentences
def featureExtraction(data):
vectorizer = TfidfVectorizer(min_df=10, max_df=0.75, ngram_range=(1,3))
tfidf_data = vectorizer.fit_transform(data)
return tfidf_data
def learning(clf, X, Y):
X_train, X_test, Y_train, Y_test = \
cross_validation.train_test_split(X,Y, test_size=.2,random_state=43)
classifier = clf()
classifier.fit(X_train, Y_train)
predict = cross_validation.cross_val_predict(classifier, X_test, Y_test, cv=5)
scores = cross_validation.cross_val_score(classifier, X_test, Y_test, cv=5)
print(scores)
print ("Accuracy of %s: %0.2f(+/- %0.2f)" % (classifier, scores.mean(), scores.std() *2))
print (classification_report(Y_test, predict))
Then I run these scripts : which I get the mentioned error after
test = LinearSVC()
data, target = preprocessed_df['Definitions'], preprocessed_df['Results']
tfidf_data = featureExtraction(data)
X_train, X_test, Y_train, Y_test = \
cross_validation.train_test_split(tfidf_data,target, test_size=.2,random_state=43)
test.fit(tfidf_data, target)
predict = cross_validation.cross_val_predict(test, X_test, Y_test, cv=10)
scores = cross_validation.cross_val_score(test, X_test, Y_test, cv=10)
print(scores)
print ("Accuracy of %s: %0.2f(+/- %0.2f)" % (test, scores.mean(), scores.std() *2))
print (classification_report(Y_test, predict))
Xnew = ["machine learning is playing games in home"]
tvect = TfidfVectorizer(min_df=1, max_df=1.0, ngram_range=(1,3))
X_test= tvect.fit_transform(Xnew)
ynew = test.predict(X_test)