Fake News Detection AttributeError: 'numpy.int64' object has no attribute 'lower'

Question

def fetchData(fileName,modelObj):

    data = pd.read_csv('C:/Users/Owner/Desktop/Project/Datasets/Data.csv')
    print ("Enter the size of data to train and test: ")
    dataSize = input()
    data=data.loc[:dataSize]
    trainDataSize=int(abs(float(dataSize) * 0.8))
    testStartIndex=int(trainDataSize)
    testEndIndex=int(dataSize)

#fetching data text feature from data set for training 
    X_train=data.iloc[:trainDataSize,2].values

#fetching real or fake  feature from data set for training
    y_train=data.iloc[:trainDataSize,-1].values

#fetching data text feature from data set for testing
    X_test=data.iloc[testStartIndex:testEndIndex,2].values

#fetching data text feature from data set for testing
    y_test=data.iloc[testStartIndex:testEndIndex,-1].values  
    print ("The data split is as follows:")
    print ("X-train :",len(X_train))
    print ("Y-train :",len(y_train)) 
    print ("X-test :",len(X_test))
    print ("Y-test :",len(y_test))

 '''fetch stop words list from nltk '''
    stopwords_=[word.encode('utf-8')for word in list(stopwords.words('english'))]
    #print stopwords_

    '''Optimization of feature generation based on Model'''
    if modelObj.__class__.__name__!='GridSearchCV':
        maxFeatures=50000
    else:
        maxFeatures=10000

    ''' intiallize tfidf object  '''
    ''' feature generation -> tfidf { parameters max_features set to a fixed number to produce results fast,
                                     stop_words are removed by initializing the param stop_words using a 
                                     stop words list fetched using NLTK lib }'''    
    tfidf = TfidfVectorizer(min_df = 1, max_features = maxFeatures, stop_words=stopwords_)

    ''' Generate TF-IDF Feature for train and test data'''
    tfidfTrain = tfidf.fit_transform(X_train)
    tfidfTest= tfidf.transform(X_test)

Traceback for the error,

AttributeError                            Traceback (most recent call last)
<ipython-input-6-28e9ec41b050> in <module>
      8 if __name__ == '__main__':
      9     print ("Welcome to Fake News Detection")
---> 10     selectTasks()

<ipython-input-5-4497d6866537> in selectTasks()
     27         else:
     28             print ("Classification on "+MODEL[x])
---> 29             runModel(options[x](PARAMS[x]))
     30 

<ipython-input-3-1e5fd0540fe3> in runModel(modelObj)
      3     #fileName=input()
      4     ''' fetch the data split '''
----> 5     X_train,y_train,X_test,y_test=fetchData('C:/Users/Owner/Desktop/Project/Datasets/Data.csv',modelObj)
      6     Visualize.plotInitalData(X_train,y_train)
      7     ''' fit the Train data '''

<ipython-input-2-116c6a1f9b37> in fetchData(fileName, modelObj)
     35     tfidf = TfidfVectorizer(min_df = 1, max_features = maxFeatures, stop_words=stopwords_)
     36     ''' Generate TF-IDF Feature for train and test data'''
---> 37     tfidfTrain = tfidf.fit_transform(X_train)
     38     tfidfTest= tfidf.transform(X_test)
     39 

c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
   1844         """
   1845         self._check_params()
-> 1846         X = super().fit_transform(raw_documents)
   1847         self._tfidf.fit(X)
   1848         # X is already a transformed view of raw_documents so

c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
   1200         max_features = self.max_features
   1201 
-> 1202         vocabulary, X = self._count_vocab(raw_documents,
   1203                                           self.fixed_vocabulary_)
   1204 

c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
   1112         for doc in raw_documents:
   1113             feature_counter = {}
-> 1114             for feature in analyze(doc):
   1115                 try:
   1116                     feature_idx = vocabulary[feature]

c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
    102     else:
    103         if preprocessor is not None:
--> 104             doc = preprocessor(doc)
    105         if tokenizer is not None:
    106             doc = tokenizer(doc)

c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _preprocess(doc, accent_function, lower)
     67     """
     68     if lower:
---> 69         doc = doc.lower()
     70     if accent_function is not None:
     71         doc = accent_function(doc)

AttributeError: 'numpy.int64' object has no attribute 'lower'

I am getting this error and am not able to debug it. Please help. I tried converting the vectors into arrays using tfidfTrain = tfidf.fit_transform(X_train).toarray() and also tfidfTest= tfidf.transform(X_test).toarray(), but it is giving me the same error. I am unable to understand what should I do?

`lower` is a string method. So apparently the `transform` function expects string input here, not numbers. — hpaulj, Jul 03 '21 at 16:19
Searching `[sklearn] lower`, I find https://stackoverflow.com/questions/40534082/sklearn-pipeline-fit-attributeerror-lower-not-found, https://stackoverflow.com/questions/50192763/python-sklearn-pipiline-fit-attributeerror-lower-not-found, https://stackoverflow.com/questions/64561219/error-int-object-has-no-attribute-lower-with-regards-to-countvectorizer-a and more — hpaulj, Jul 03 '21 at 20:13

Fake News Detection AttributeError: 'numpy.int64' object has no attribute 'lower'

0 Answers0