I am using the code below to construct a document-term matrix in Python.
# Importing the libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
dataset=pd.read_csv("trainset.csv",encoding = "ISO-8859-1")
dataset['ProductDescription'] = dataset['ProductDescription'].str.replace(r'[^\w\s]', ' ', regex=True)
dataset['ProductDescription'] = dataset['ProductDescription'].str.replace(r'\d', ' ', regex=True)
dataset['ProductDescription']=dataset['ProductDescription'].str.lower()
stop = set(stopwords.words('english'))
dataset['ProductDescription'] = dataset['ProductDescription'].str.replace(r'\b(' + r'|'.join(stop) + r')\b\s*', ' ', regex=True)
vectorizer = CountVectorizer()
x1 = vectorizer.fit_transform(dataset['ProductDescription'].values.astype('U'))
df = pd.DataFrame(x1.toarray().transpose(), index = vectorizer.get_feature_names())
For a dataset of 10,000 rows the code works fine, but when I use a larger dataset of around 1,100,000 rows I get a memory error when I execute
df = pd.DataFrame(x1.toarray().transpose(), index = vectorizer.get_feature_names())
Can somebody please tell me where I have gone wrong?
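One direction I have been experimenting with is keeping the document-term matrix sparse instead of densifying it with toarray(). Below is a minimal sketch of what I mean, reusing dataset and the preprocessing above; it assumes pandas >= 0.25 for the sparse accessor, and get_feature_names_out() is the newer scikit-learn name (older versions use get_feature_names()):
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
x1 = vectorizer.fit_transform(dataset['ProductDescription'].values.astype('U'))

# Build a sparse DataFrame directly from the scipy sparse matrix,
# so the full dense document-term array is never allocated in memory.
df = pd.DataFrame.sparse.from_spmatrix(
    x1.transpose(),                            # terms as rows, documents as columns
    index=vectorizer.get_feature_names_out()   # get_feature_names() on older scikit-learn
)
Even with that, I am not sure whether a matrix over 1,100,000 documents is practical to hold as a DataFrame at all, or whether I should just keep working with the scipy sparse matrix x1 directly.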