I have two separate data sets: one contains resumes and the other contains demands. Using gensim Doc2Vec, I created a model for each, and I am able to query for similar words within each data set. Now I need to merge these two models into one, query the demands for a given resume, and obtain the similarity (matching score) between them. My data sets are plain .txt files in which consecutive resumes or demands are separated by * . Please find my implementation below; any suggestions would be highly appreciated. Thanks.
import gensim
import os
import collections
import smart_open
import random
def read_corpus(fname, tokens_only=False, delimiter='&&', encoding=None):
    """Yield one tokenised document per delimiter-separated chunk of a file.

    The whole file is read at once, split on ``delimiter``, and every chunk
    that still holds more than one character after stripping whitespace is
    tokenised with ``gensim.utils.simple_preprocess``.

    Args:
        fname: Path of the text file to read.
        tokens_only: If true, yield plain token lists (e.g. for inference).
            Otherwise yield ``TaggedDocument`` objects for training.
        delimiter: String that separates two documents in the file.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Yields:
        A list of tokens, or a ``TaggedDocument`` whose single tag is the
        zero-based index of the document among the documents yielded.
    """
    # Read everything up front so the file handle is closed before the first
    # document is yielded, even if the caller abandons the generator early.
    with open(fname, encoding=encoding) as f:
        chunks = f.read().split(delimiter)

    tag = 0
    for chunk in chunks:
        # Strip before measuring: a chunk made only of blank lines would
        # otherwise pass the length check and become an empty document.
        if len(chunk.strip()) <= 1:
            continue
        tokens = gensim.utils.simple_preprocess(chunk)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags. Only kept documents advance the
            # counter, so tags stay consecutive starting at 0.
            yield gensim.models.doc2vec.TaggedDocument(tokens, [tag])
        tag += 1
# Build the training corpus from the demands file. The raw string keeps the
# Windows path literal: '\D' is not a valid escape sequence in a normal string.
vocabulary = read_corpus(r'D:\Demand.txt')
train_corpus = list(vocabulary)
print(train_corpus[:2])

# NOTE(review): `size`, `iter` and `docvecs` are the older gensim names; newer
# releases presumably expect `vector_size`, `epochs` and `dv` -- confirm
# against the installed gensim version before renaming.
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=55)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

# Infer a vector for an unseen list of tokens using the trained model.
print(model.infer_vector(['trainings', 'certifications', 'analyst', 'unix', 'jdbc','testing']))

# Rank the training documents against an inferred vector. The result is not
# assigned or printed, so it is only visible in an interactive session.
model.docvecs.most_similar(positive=[model.infer_vector(['spark', 'sqoop'])])

# Word-level neighbours are queried through the model's word vectors (`wv`)
# rather than the deprecated model-level `most_similar`.
model.wv.most_similar('unix')