The code below is more or less the character-level language model from the Keras examples. The thing is that it predicts characters, not words, and strictly speaking a language model should predict full words.
My question is: how do I change this so that it predicts full words?
from __future__ import print_function
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
path = "C:/Users/Cedric Oeldorf/Desktop/University/Research/Data/Gutenberg/MYDATAFINAL3.txt"
text = open(path).read().lower()
print('corpus length:', len(text))
chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
from keras.callbacks import History
histLSTM = History()
# build the model: 2 stacked LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.fit(X, y, batch_size=128, nb_epoch=4, callbacks=[histLSTM])
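For reference, the Keras example this is based on decodes predictions one character at a time with temperature sampling, which is where you can see that the model really does predict characters, not words. A minimal sketch adapted from that example (the temperature 0.5 and the 100-character length are arbitrary picks of mine):

def sample(preds, temperature=1.0):
    # draw one index from the softmax output, sharpened or flattened by temperature
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return np.argmax(np.random.multinomial(1, preds, 1))

start = random.randint(0, len(text) - maxlen - 1)
sentence = text[start: start + maxlen]
generated = sentence
for _ in range(100):
    # one-hot encode the current window, exactly as in training
    x = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(sentence):
        x[0, t, char_indices[char]] = 1.
    preds = model.predict(x, verbose=0)[0]
    next_char = indices_char[sample(preds, 0.5)]
    generated += next_char
    sentence = sentence[1:] + next_char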
My data preprocessing idea so far is:
path = "C:/MYDATAFINAL3.txt"
text = open(path).read().lower()
print('corpus length:', len(text))
# tokenize the corpus and get the list of unique words
import gensim
tok = gensim.utils.simple_preprocess(text, deacc=False)
words = set(tok)
word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))
sentences1 = text.split('.')
SYMBOLS = '{}()[].,:;+-*/&|<>=~$'
# strip punctuation from each sentence (Python 2 str.translate signature;
# note that sentences1 and m end up unused below)
m = [item.translate(None, SYMBOLS).strip() for item in sentences1]
del text
maxlen = 60
step = 3
sentences = []
next_words = []
for i in range(0, len(tok) - maxlen, step):
    sentences.append(tok[i: i + maxlen])
    next_words.append(tok[i + maxlen])
print('nb sequences:', len(sentences))
X = np.zeros((len(sentences), maxlen), dtype="int32")
y = np.zeros((len(sentences), maxlen), dtype="int32")
This step isn't working out:
# fill X with the word index at every timestep, same for y
for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence):
        X[i, t] = word_indices[word]
        y[i, t] = word_indices[word]
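My suspicion is that this fails because X and y receive the identical index at every timestep, so the network would only learn to copy its input, and categorical_crossentropy with a softmax over the vocabulary also expects one-hot (or sparse) targets, not this. If the goal is next-word prediction, a shifted target might be what's needed; this is just a sketch of my assumption about what y should hold, not something from the original example:

for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence):
        X[i, t] = word_indices[word]
        # the target at step t is the word at step t + 1; the final
        # target is the word after the whole window, i.e. next_words[i]
        if t < maxlen - 1:
            y[i, t] = word_indices[sentence[t + 1]]
        else:
            y[i, t] = word_indices[next_words[i]]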
And I don't know what input shape I should be using:
from keras.layers.recurrent import GRU
from keras.layers.core import TimeDistributedDense
print('Build model...')
model = Sequential()
model.add(GRU(512, return_sequences=True, input_shape=(len(sentences), maxlen)))
model.add(Dropout(0.2))
model.add(GRU(512, return_sequences=True))
model.add(Dropout(0.2))
#model.add(Dense(len(chars)))
#Insert this instead:
model.add(TimeDistributedDense(len(words)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.fit(X, y, batch_size=128, nb_epoch=2)
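From what I've read, input_shape should never include the number of samples (len(sentences)); each sample is just maxlen integer word indices. The common pattern for word-level models seems to be an Embedding layer, which maps each index to a dense vector and avoids materialising a huge (nb_sequences, maxlen, vocab) one-hot tensor. A rough sketch of what I think the model could look like, assuming a Keras version that provides Embedding, TimeDistributed and sparse_categorical_crossentropy (embedding_dim = 128 is an arbitrary pick of mine):

from keras.models import Sequential
from keras.layers import Embedding, GRU, Dropout, TimeDistributed, Dense, Activation

vocab_size = len(words)
embedding_dim = 128  # hypothetical size, not from the original code

model = Sequential()
# maps each integer word index to a dense vector, so X can stay an
# int matrix of shape (nb_sequences, maxlen) -- no one-hot needed
model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(GRU(512, return_sequences=True))
model.add(Dropout(0.2))
model.add(GRU(512, return_sequences=True))
model.add(Dropout(0.2))
# one softmax over the full vocabulary at every timestep
model.add(TimeDistributed(Dense(vocab_size)))
model.add(Activation('softmax'))
# sparse targets keep y as integer indices; the loss expects a
# trailing dimension of 1, hence y[:, :, None] below
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')
model.fit(X, y[:, :, None], batch_size=128, nb_epoch=2)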