I'm trying to train a Doc2Vec model on massive data: 20k files, 72 GB in total. I wrote this code:
    from os import listdir
    from os.path import isfile, join
    import random

    from nltk.tokenize import word_tokenize
    from gensim.models.doc2vec import Doc2Vec

    def train():
        onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
        data = []
        random.shuffle(onlyfiles)
        tagged_data = []
        t = 0
        try:
            # Read and tokenize every file up front, keeping everything in memory
            for file_name in onlyfiles:
                with open(mypath + "/" + file_name, 'r', encoding="utf-8") as file:
                    txt = file.read()
                    tagged_data.append([word_tokenize(txt.lower()), [str(t)]])
                    t += 1
        except Exception as e:
            print(t)
            return
        print("Files Loaded")

        max_epochs = 1000
        vec_size = 500
        alpha = 0.025

        model = Doc2Vec(vector_size=vec_size,
                        alpha=alpha, workers=1,
                        min_alpha=0.00025,
                        min_count=1,
                        dm=1)
        print("Model Works")

        print("Building vocabulary")
        model.build_vocab(tagged_data)

        print("Training")
        for epoch in range(max_epochs):
            print("Iteration {0}".format(epoch))
            model.train(tagged_data,
                        total_examples=model.corpus_count,
                        epochs=model.iter)  # pre-gensim-4.0 name for the epoch count
            model.alpha -= 0.0002
            model.min_alpha = model.alpha

        model.save(model_name)
        print("Model Saved")
But when I run this method, this error appears:

    Traceback (most recent call last):
      File "doc2vec.py", line 20, in train
        tagged_data.append([word_tokenize(txt.lower()), [str(t)]])
    MemoryError
Only about 3k files are processed before it fails. But when I check memory usage, the Python process shows that only 1.7% of the machine's memory was used.
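Could this be a limitation of my Python build rather than of the machine? As far as I know, a 32-bit interpreter caps a single process at roughly 2-4 GB no matter how much RAM is installed. This is just a quick sanity check I would run, not part of my script:

    import sys
    import struct

    # Prints 32 on a 32-bit build, 64 on a 64-bit build
    print(struct.calcsize("P") * 8, "-bit Python")
    # Roughly 2**31 on 32-bit, 2**63 on 64-bit
    print("sys.maxsize =", sys.maxsize)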
Is there any parameter I can pass to Python to solve this? How can I fix it?
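One idea I had: instead of building the whole tagged_data list in memory, stream the documents from disk with a re-iterable object that reads and tokenizes one file at a time. Would something like this work? This is only a rough sketch: the TaggedFileCorpus name and the parameter values (epochs, min_count, workers) are placeholders I made up, mypath and model_name are the same variables as in my script, and I have not tested it on the full corpus.

    from os import listdir
    from os.path import isfile, join

    from nltk.tokenize import word_tokenize
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    class TaggedFileCorpus:
        """Re-iterable corpus: reads and tokenizes one file per iteration step."""
        def __init__(self, path):
            self.path = path
            self.files = sorted(f for f in listdir(path) if isfile(join(path, f)))

        def __iter__(self):
            for t, file_name in enumerate(self.files):
                with open(join(self.path, file_name), 'r', encoding="utf-8") as f:
                    yield TaggedDocument(words=word_tokenize(f.read().lower()),
                                         tags=[str(t)])

    corpus = TaggedFileCorpus(mypath)
    model = Doc2Vec(vector_size=500, min_count=2, dm=1, epochs=20, workers=4)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(model_name)

As I understand it, gensim only needs an object it can iterate over multiple times, so each pass reads one file at a time instead of holding all 72 GB of tokenized text in RAM at once.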