I am trying to tag sentences using a custom defined dictionary. For example if I have two text files (1. My sentences, 2. My dictionary)
Sentences file:
I have abdominal pain and have difficulty breathing
Dictionary file:
Abdominal pain, difficulty breathing
I would like output to be like this:
New file:
I have abdominal pain (AE) and have difficulty breathing (AE)
How can this be done? Please see the following code:
import csv
from difflib import SequenceMatcher as SM
from nltk.util import ngrams
import codecs
with open('dictionary.csv','r') as csvFile:
reader = csv.reader(csvFile)
myfile = open("sentences.txt", "rt")
my3file = open("tagged_sentences.txt", "w")
hay = myfile.read()
myfile.close()
phrases = []
for row in reader:
needle = row[1]
needle_length = len(needle.split())
max_sim_val = 0.9
max_sim_string = u""
for ngram in ngrams(hay.split(), needle_length + int(.2 * needle_length)):
hay_ngram = u" ".join(ngram)
similarity = SM(None, hay_ngram, needle).ratio()
if similarity > max_sim_val:
max_sim_val = similarity
max_sim_string = hay_ngram
str = [row[1] , ' ', max_sim_val.__str__(),' ', max_sim_string , '\n']
str1 = max_sim_string , row[2]
phrases.append((max_sim_string, row[2]))
for line in hay.splitlines():
if any(max_sim_string in line for max_sim_string, _ in phrases):
for phrase in phrases:
max_sim_string, _ = phrase
if max_sim_string in line:
tag_sent = line.replace(max_sim_string, phrase.__str__())
my3file.writelines(tag_sent + '\n')
print(tag_sent)
break
else:
my3file.writelines(line + '\n')
csvFile.close()
The above code just creates an empty "tagged_sentences" file Thanks