
I am implementing the Viterbi algorithm for POS tagging, but it is performing poorly. I think there might be something inherently wrong with my implementation, but my friend thinks it's an underflow issue. What do you guys think?
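To sanity-check my friend's underflow theory in isolation, here is a standalone snippet (not part of my tagger) showing why people run Viterbi in log space: a product of small probabilities underflows to zero long before a typical sentence ends, while the summed log-probabilities stay finite.

import math

# product of 100 "typical" emission/transition probabilities
p = 1.0
for _ in range(100):
    p *= 1e-5
print(p)                     # 0.0 -- 1e-500 is below the smallest positive double (~4.9e-324)

# the same quantity in log space is perfectly representable
print(100 * math.log(1e-5))  # -1151.2925...

Here is my implementation: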

import math  # used by math.log in the recursion below

def predict(self, data_path, results_path):
    '''Predict the tags of each sentence in data_path and write the results to results_path.
    @param data_path: path of the input data file (one word per line, sentences separated by blank lines)
    @param results_path: path of the output file
    @return: a list of lists; each inner list holds the words of one sentence with the predicted tag appended'''

    # load data: one word per line, sentences separated by blank lines
    with open(data_path, 'r', encoding="utf-8") as f:
        whole_document = [line.strip() for line in f]

    output_sequences = []
    sub_sequence = []
    for line in whole_document:
        if line:
            sub_sequence.append(line)
        elif sub_sequence:  # blank line ends a sentence; skip repeated blanks
            output_sequences.append(sub_sequence)
            sub_sequence = []
    if sub_sequence:  # flush the last sentence if the file has no trailing blank line
        output_sequences.append(sub_sequence)

    
    # Viterbi: score the best subpath ending in each tag at each position,
    # then chain those per-position scores back into the full best path
    transition_params = self.transition_params
    emission_params = self.emission_params

    # get all possible unique tags
    tags = set(key[1] for key in emission_params.keys())
    tags.add("START")
    tags.add("STOP")

    results = []

    for output_sequence in output_sequences:
        pi = {position: {tag: 0 for tag in tags} for position in range(len(output_sequence) + 2)}
        pi[0]["START"] = 1

        # forward pass
        word_set = set(key[0] for key in emission_params.keys())
        for k in range(1, len(output_sequence) + 1):
            word = output_sequence[k - 1]
            if word not in word_set:
                word = "#UNK#"
            for current_tag in tags:
                if current_tag in ["START", "STOP"]:
                    continue
                max_prob = max([pi[k - 1][u] * math.log(transition_params.get((current_tag, u), 0) + 10**-10) * math.log(emission_params.get((word, current_tag), 0) + 10**-10) for u in tags])
                pi[k][current_tag] = max_prob

        pi[len(output_sequence) + 1]["STOP"] = max([pi[len(output_sequence)][u] * math.log(transition_params.get(("STOP", u), 0) + 10**-10) for u in tags])

        # backward pass
        y_stars = {position: None for position in range(len(output_sequence) + 1)}

        y_n_star = max(tags, key=lambda u: pi[len(output_sequence)][u] * transition_params.get(("STOP", u), 0))
        y_stars[len(output_sequence)] = y_n_star

        y_j_plus_1_star = y_n_star
        for j in range(len(output_sequence) - 1, 0, -1):
            y_j_star = max(tags, key=lambda u: pi[j][u] * transition_params.get((u, y_j_plus_1_star), 0))
            y_stars[j] = y_j_star
            y_j_plus_1_star = y_j_star

        sentence_tags = [f"{word} {y_stars[i+1]}" for i, word in enumerate(output_sequence)]
        results.append(sentence_tags)

    # write all results to file once, after every sentence has been tagged
    with open(results_path, "w", encoding="utf-8") as file:
        for sentence in results:
            for word_tag in sentence:
                file.write(f"{word_tag}\n")
            file.write("\n")

    return results

I tried taking the log of the probabilities in the recursion.
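For comparison, here is the textbook log-space recursion I was trying to follow, as a standalone sketch. The names (viterbi_log, trans, emit) are placeholders of mine, with the dicts keyed the same way as my transition_params[(to_tag, from_tag)] and emission_params[(word, tag)]. In log space every term is added rather than multiplied, the start score is log(1) = 0, impossible states are -inf, and backpointers recorded in the forward pass replace the separate backward pass:

import math

NEG_INF = float("-inf")

def viterbi_log(sentence, tags, trans, emit):
    # trans[(to_tag, from_tag)] and emit[(word, tag)] hold plain probabilities;
    # unseen events get a small floor inside the log instead of a constant
    # added to every probability
    n = len(sentence)
    pi = [{t: NEG_INF for t in tags} for _ in range(n + 1)]
    bp = [dict() for _ in range(n + 1)]  # backpointers
    pi[0]["START"] = 0.0                 # log(1) = 0, not 1

    for k in range(1, n + 1):
        word = sentence[k - 1]
        for v in tags:
            if v in ("START", "STOP"):
                continue
            e = math.log(emit.get((word, v), 1e-10))
            # log scores are ADDED, never multiplied
            best_u, best = max(
                ((u, pi[k - 1][u] + math.log(trans.get((v, u), 1e-10)) + e)
                 for u in tags),
                key=lambda pair: pair[1],
            )
            pi[k][v] = best
            bp[k][v] = best_u

    # termination: best tag before STOP, then walk the backpointers
    last = max(tags, key=lambda u: pi[n][u] + math.log(trans.get(("STOP", u), 1e-10)))
    path = [last]
    for k in range(n, 1, -1):
        path.append(bp[k][path[-1]])
    return list(reversed(path))

In this form nothing is ever multiplied, so underflow cannot happen at all. Is the remaining difference from my code (the multiplied logs and the pi[0]["START"] = 1 start value) enough to explain the bad tagging?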
