I don't know how to train a Doc2Vec model in multiple batches: right now I load all of my data into RAM at once, and it doesn't fit. Here is my current code:
# Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import ReadExeFileCapstone  # custom module that reads the executables into a dict

mapData = ReadExeFileCapstone.readData()
# print ('mapData', mapData)
max_epochs = 10000
vec_size = 200
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1)
data = []
for key in mapData:
    listData = mapData[key]
    # print ("listData: ", len(listData), listData)
    for i in range(len(listData)):
        listToStr = ' '.join([str(elem) for elem in listData[i]])  # convert each array into one string
        data.append(listToStr)

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
               for i, _d in enumerate(data)]
# build vocab
model.build_vocab(tagged_data)
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

# save the trained model
model.save("d2v_ASM.model")
print("Model Saved")