Given that I have a data set with 2 columns containing text data, I have to concatenate these 2 columns and then find the top 2k words using idf_ values and then use these words to create a co-occurrence matrix. I am getting with below code an index error. Can any one please provide me the code to get the working co occurrence matrix.
singular value decomposition: SVD
def get_words_in_window(sent, w, window = 5):
context_words = []
for index, word in enumerate(sentence.split()):
if word == w:
if index < window:
lower_index = 0
upper_index = window+index
elif len(sentence.split()) - index <= window:
lower_index = index - window
upper_index = len(sentence.split())-1
else:
lower_index = index - window
upper_index = index + window
for i in range(lower_index, upper_index+1):
if i != index:
context_words.append(sentence.split()[i])
return context_words
from tqdm import tqdm
for sentence in tqdm(essays_titles['essay_title']):
for w in sentence.split():
if w in top_2k_words:
context_words = get_words_in_window(sentence, w)
for w2 in context_words:
if w2 in top_2k_words:
cooc_matrix[top_2k_words.index(w)][top_2k_words.index(w2)]+=1
The error:
IndexError: list index out of range