I am trying to build a simple trigram text generator for Bulgarian, but my code gets stuck in an endless loop. Here is the code:
from tokenization import tokenize_bulgarian_text
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
import random

with open('IvanVazov1.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# Tokenize text
tokenized_sentences = tokenize_bulgarian_text(data)
print(tokenized_sentences)

# Create a placeholder for the model
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurrence
for sentence in tokenized_sentences:
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        print("Trigram:", (w1, w2, w3))
        model[(w1, w2)][w3] += 1

# Transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count
print(model)

# starting words
text = ["беше"]
sentence_finished = False
print("Starting words:", text)

while not sentence_finished:
    # select a random probability threshold
    r = random.random()
    accumulator = 0.0

    for word in model[tuple(text[-2:])].keys():
        # this print statement never executes
        print(f"word: {word}, probability: {model[tuple(text[-2:])][word]}, accumulator: {accumulator}")
        accumulator += model[tuple(text[-2:])][word]
        # select words that are above the probability threshold
        if accumulator >= r:
            text.append(word)
            break

    if text[-2:] == [None, None]:
        print("End of sentence.")
        sentence_finished = True

print(' '.join([t for t in text if t]))
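To see what the while loop is actually looking up, a probe like the following could go right before the inner for loop (just a debugging sketch, not part of the original code; .get() is used so the lookup does not insert an empty entry into the defaultdict):

# debugging sketch: show the context key and how many continuations it has
context = tuple(text[-2:])
print("context:", context, "->", len(model.get(context, {})), "continuations")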
Here is my tokenization.py file that I import:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import string
from spacy.lang.bg.stop_words import STOP_WORDS as bg_stopwords

def tokenize_bulgarian_text(text):
    extra_stopwords = {"—", "“", "„", "не", "та", "па"}
    bg_stopwords.update(extra_stopwords)

    # Remove punctuation and lowercase all letters
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()

    # Split the text into individual sentences
    sentences = sent_tokenize(text)

    # Tokenize and filter each sentence
    filtered_tokens = []
    for sentence in sentences:
        # Split the sentence into individual words or tokens
        tokens = word_tokenize(sentence)
        # Remove any stop words from the tokenized text
        filtered_sentence = [word for word in tokens if word not in bg_stopwords]
        filtered_tokens.append(filtered_sentence)
    return filtered_tokens
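For reference, a quick sanity check of the tokenizer on the first sentence of my data would look like this (a sketch; sent_tokenize and word_tokenize need NLTK's punkt data to be downloaded first):

# sanity-check sketch for the tokenizer (requires nltk.download('punkt'))
from tokenization import tokenize_bulgarian_text

sample = "Нощта беше влажна и мрачна и браилските улици пустееха."
print(tokenize_bulgarian_text(sample))
# expected shape: a list of sentences, each a list of lowercased tokens
# with punctuation and stop words removed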
And this is the text file (IvanVazov1.txt) that I use for data:
Нощта беше влажна и мрачна и браилските улици пустееха. Студената декемврийска мъгла, която обикновено пада покрай бреговете на Дунава, се беше напластила в една от главните улици на града и задушаваше с отровния си дъх последните минувачи, които бързаха да се приберат у дома си.
From what I have gathered, the tokenization is fine and the trigrams are correct. I tried using print statements to debug the code. For some reason, the print statement in the nested for loop never executes, and new words are never appended to the text list. I copy-pasted the code from a website, and it works fine with reuters.sents() instead of my tokenized_sentences.
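For comparison, the working version differed only in where the sentences come from, roughly like this (a sketch; it assumes the Reuters corpus has been downloaded with nltk.download('reuters')):

# sketch of the working variant: same pipeline, fed from the Reuters corpus
from nltk.corpus import reuters  # requires nltk.download('reuters')

tokenized_sentences = reuters.sents()
# everything after this line is identical to the script above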