I am trying to follow the steps given in a blog post (https://stackabuse.com/gpt-style-text-generation-in-python-with-tensorflowkeras/), but I am getting an error at the block below:
# Excerpt where the error is raised (these lines also appear in the full script):
vectorize_layer.adapt(text_list)
vocab = vectorize_layer.get_vocabulary()
vocab_size = len(vocab)
print(vocab_size )
Here is the full code:
import keras_nlp
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
# Plain-text sources on Project Gutenberg for four Dostoevsky novels.
crime_and_punishment_url = 'https://www.gutenberg.org/files/2554/2554-0.txt'
brothers_of_karamazov_url = 'https://www.gutenberg.org/files/28054/28054-0.txt'
the_idiot_url = 'https://www.gutenberg.org/files/2638/2638-0.txt'
the_possessed_url = 'https://www.gutenberg.org/files/8117/8117-0.txt'

# Parallel lists: paths[i] is the download URL for the book titled names[i].
names = [
    'Crime and Punishment',
    'Brothers of Karamazov',
    'The Idiot',
    'The Possessed',
]
paths = [
    crime_and_punishment_url,
    brothers_of_karamazov_url,
    the_idiot_url,
    the_possessed_url,
]
# Build one large corpus: download each book, drop the Gutenberg
# intro/preface, and concatenate the remainders.
# (Indentation restored — the original paste had the loop body at column 0.)
texts = ''
for index, path in enumerate(paths):
    filepath = keras.utils.get_file(f'{names[index]}.txt', origin=path)
    with open(filepath, encoding='utf-8') as f:
        text = f.read()
    # The first ~50 lines are the Gutenberg intro and preface; skipping the
    # first 10k characters of each book approximately removes them.
    texts += text[10000:]

# Sanity check: peek at a slice of the assembled corpus.
print(texts[25000:25500])

# Naive "sentence" split on periods; filter(None, ...) drops the empty
# strings produced by consecutive or trailing periods.
# NOTE(review): removed two bare `len(...)` expressions from the original —
# they are notebook display artifacts with no effect in a script.
text_list = texts.split('.')
text_list = list(filter(None, text_list))
import random

# Shuffle sentence order, then carve the corpus into a
# 70% / 15% / 15% train / test / validation split.
random.shuffle(text_list)
length = len(text_list)
split_a, split_b = int(0.7 * length), int(0.85 * length)
text_train = text_list[:split_a]
text_test = text_list[split_a:split_b]
text_valid = text_list[split_b:]
def custom_standardization(input_string):
    """Lowercase the input and replace newlines with spaces.

    Passed as the `standardize` callable of `TextVectorization`, so it
    must operate on `tf.string` tensors via `tf.strings` ops.

    NOTE: this requires `import tensorflow as tf` at the top of the file.
    The original script omitted that import, which is what raises
    `NameError: name 'tf' is not defined` when the layer invokes this
    function during `vectorize_layer.adapt(...)`.
    """
    sentence = tf.strings.lower(input_string)
    sentence = tf.strings.regex_replace(sentence, "\n", " ")
    return sentence
# Fixed sequence length for vectorized samples. You could instead derive it
# from the data — maxlen = len(max(text_list).split(' ')) — which is 25 here.
maxlen = 50

# Map raw strings to integer token-id sequences of length maxlen + 1,
# standardized by the custom lowercase/newline-replacing function above.
vectorize_layer = TextVectorization(
    output_sequence_length=maxlen + 1,
    output_mode="int",
    standardize=custom_standardization,
)
# Learn the vocabulary from the sentence list, then report its size.
# (Removed the stray ``` markdown fence that was fused onto the last line
# of the original paste.)
vectorize_layer.adapt(text_list)
vocab = vectorize_layer.get_vocabulary()
vocab_size = len(vocab)
print(vocab_size)
print('DONE')