def fetchData(fileName,modelObj):
data = pd.read_csv('C:/Users/Owner/Desktop/Project/Datasets/Data.csv')
print ("Enter the size of data to train and test: ")
dataSize = input()
data=data.loc[:dataSize]
trainDataSize=int(abs(float(dataSize) * 0.8))
testStartIndex=int(trainDataSize)
testEndIndex=int(dataSize)
#fetching data text feature from data set for training
X_train=data.iloc[:trainDataSize,2].values
#fetching real or fake feature from data set for training
y_train=data.iloc[:trainDataSize,-1].values
#fetching data text feature from data set for testing
X_test=data.iloc[testStartIndex:testEndIndex,2].values
#fetching data text feature from data set for testing
y_test=data.iloc[testStartIndex:testEndIndex,-1].values
print ("The data split is as follows:")
print ("X-train :",len(X_train))
print ("Y-train :",len(y_train))
print ("X-test :",len(X_test))
print ("Y-test :",len(y_test))
'''fetch stop words list from nltk '''
stopwords_=[word.encode('utf-8')for word in list(stopwords.words('english'))]
#print stopwords_
'''Optimization of feature generation based on Model'''
if modelObj.__class__.__name__!='GridSearchCV':
maxFeatures=50000
else:
maxFeatures=10000
''' intiallize tfidf object '''
''' feature generation -> tfidf { parameters max_features set to a fixed number to produce results fast,
stop_words are removed by initializing the param stop_words using a
stop words list fetched using NLTK lib }'''
tfidf = TfidfVectorizer(min_df = 1, max_features = maxFeatures, stop_words=stopwords_)
''' Generate TF-IDF Feature for train and test data'''
tfidfTrain = tfidf.fit_transform(X_train)
tfidfTest= tfidf.transform(X_test)
Traceback for the error,
AttributeError Traceback (most recent call last)
<ipython-input-6-28e9ec41b050> in <module>
8 if __name__ == '__main__':
9 print ("Welcome to Fake News Detection")
---> 10 selectTasks()
<ipython-input-5-4497d6866537> in selectTasks()
27 else:
28 print ("Classification on "+MODEL[x])
---> 29 runModel(options[x](PARAMS[x]))
30
<ipython-input-3-1e5fd0540fe3> in runModel(modelObj)
3 #fileName=input()
4 ''' fetch the data split '''
----> 5 X_train,y_train,X_test,y_test=fetchData('C:/Users/Owner/Desktop/Project/Datasets/Data.csv',modelObj)
6 Visualize.plotInitalData(X_train,y_train)
7 ''' fit the Train data '''
<ipython-input-2-116c6a1f9b37> in fetchData(fileName, modelObj)
35 tfidf = TfidfVectorizer(min_df = 1, max_features = maxFeatures, stop_words=stopwords_)
36 ''' Generate TF-IDF Feature for train and test data'''
---> 37 tfidfTrain = tfidf.fit_transform(X_train)
38 tfidfTest= tfidf.transform(X_test)
39
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1844 """
1845 self._check_params()
-> 1846 X = super().fit_transform(raw_documents)
1847 self._tfidf.fit(X)
1848 # X is already a transformed view of raw_documents so
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1200 max_features = self.max_features
1201
-> 1202 vocabulary, X = self._count_vocab(raw_documents,
1203 self.fixed_vocabulary_)
1204
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
1112 for doc in raw_documents:
1113 feature_counter = {}
-> 1114 for feature in analyze(doc):
1115 try:
1116 feature_idx = vocabulary[feature]
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
102 else:
103 if preprocessor is not None:
--> 104 doc = preprocessor(doc)
105 if tokenizer is not None:
106 doc = tokenizer(doc)
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _preprocess(doc, accent_function, lower)
67 """
68 if lower:
---> 69 doc = doc.lower()
70 if accent_function is not None:
71 doc = accent_function(doc)
AttributeError: 'numpy.int64' object has no attribute 'lower'
I am getting this error and am not able to debug it. Please help. I tried converting the vectors into arrays using tfidfTrain = tfidf.fit_transform(X_train).toarray()
and also tfidfTest= tfidf.transform(X_test).toarray()
, but it is giving me the same error. I am unable to understand what should I do?