I am implementing the viterbi algorithm but it is performing poorly on POS tagging, I think there might be something wrong with my implementation inherently but my friend thinks its an issue of underflow. What do you guys think?
def predict(self, data_path, results_path):
'''To predict the tags of the sentence in data_path
@param data_path: the path of the data file
@return: a list of lists, each list is a line of the data file with the predicted tag appended at the end'''
# load data which will be a sentence stored as a list of words
with open(data_path, 'r', encoding="utf-8") as f:
raw_data = f.readlines()
whole_document = [line.strip() for line in raw_data]
output_sequences = []
sub_sequence = []
for line in whole_document:
if line:
sub_sequence.append(line)
else:
output_sequences.append(sub_sequence)
sub_sequence = []
# use viterbi algorithm
# get subpaths
# those subpaths make up the full path
transition_params = self.transition_params
emission_params = self.emission_params
# get all possible unique tags
tags = set(key[1] for key in emission_params.keys())
tags.add("START")
tags.add("STOP")
results = []
for output_sequence in output_sequences:
pi = {position: {tag: 0 for tag in tags} for position in range(len(output_sequence) + 2)}
pi[0]["START"] = 1
# forward pass
word_set = set(key[0] for key in emission_params.keys())
for k in range(1, len(output_sequence) + 1):
word = output_sequence[k - 1]
if word not in word_set:
word = "#UNK#"
for current_tag in tags:
if current_tag in ["START", "STOP"]:
continue
max_prob = max([pi[k - 1][u] * math.log(transition_params.get((current_tag, u), 0) + 10**-10) * math.log(emission_params.get((word, current_tag), 0) + 10**-10) for u in tags])
pi[k][current_tag] = max_prob
pi[len(output_sequence) + 1]["STOP"] = max([pi[len(output_sequence)][u] * math.log(transition_params.get(("STOP", u), 0) + 10**-10) for u in tags])
# backward pass
y_stars = {position: None for position in range(len(output_sequence) + 1)}
y_n_star = max(tags, key=lambda u: pi[len(output_sequence)][u] * transition_params.get(("STOP", u), 0))
y_stars[len(output_sequence)] = y_n_star
y_j_plus_1_star = y_n_star
for j in range(len(output_sequence) - 1, 0, -1):
y_j_star = max(tags, key=lambda u: pi[j][u] * transition_params.get((u, y_j_plus_1_star), 0))
y_stars[j] = y_j_star
y_j_plus_1_star = y_j_star
sentence_tags = [f"{word} {y_stars[i+1]}" for i, word in enumerate(output_sequence)]
results.append(sentence_tags)
# print(results)
# write the results to file
with open(results_path, "w", encoding="utf-8") as file:
for sentence in results:
for word_tag in sentence:
file.write(f"{word_tag}\n")
file.write("\n")
return results
I tried taking the log of the probabilities in the recursion.