The code below is more or less the character-level language model from the Keras examples. The thing is that it predicts characters, not words, and strictly speaking a language model should predict full words.
My question is: how do I change this so that it predicts full words?
from __future__ import print_function
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
path = "C:/Users/Cedric Oeldorf/Desktop/University/Research/Data/Gutenberg/MYDATAFINAL3.txt"
text = open(path).read().lower()
print('corpus length:', len(text))
chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
from keras.callbacks import History
histLSTM = History()
# build the model: 2 stacked LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.fit(X, y, batch_size=128, nb_epoch=4, callbacks=[histLSTM])
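For reference, the Keras example this is based on decodes predictions one character at a time with temperature sampling, which is where you can see that the model really does predict characters, not words. A minimal sketch adapted from that example (the temperature 0.5 and the 100-character length are arbitrary picks of mine):

def sample(preds, temperature=1.0):
    # draw one index from the softmax output, sharpened or flattened by temperature
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return np.argmax(np.random.multinomial(1, preds, 1))

start = random.randint(0, len(text) - maxlen - 1)
sentence = text[start: start + maxlen]
generated = sentence
for _ in range(100):
    # one-hot encode the current window, exactly as in training
    x = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(sentence):
        x[0, t, char_indices[char]] = 1.
    preds = model.predict(x, verbose=0)[0]
    next_char = indices_char[sample(preds, 0.5)]
    generated += next_char
    sentence = sentence[1:] + next_char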
My data preprocessing idea so far is:
path = "C:/MYDATAFINAL3.txt"
text = open(path).read().lower()
print('corpus length:', len(text))
# tokenize the corpus and get the list of unique words
import gensim
tok = gensim.utils.simple_preprocess(text, deacc=False)
words = set(tok)
word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))
sentences1 = text.split('.')
SYMBOLS = '{}()[].,:;+-*/&|<>=~$'
# strip punctuation from each sentence (Python 2 str.translate signature;
# note that sentences1 and m end up unused below)
m = [item.translate(None, SYMBOLS).strip() for item in sentences1]
del text
maxlen = 60
step = 3
sentences = []
next_words = []
for i in range(0, len(tok) - maxlen, step):
    sentences.append(tok[i: i + maxlen])
    next_words.append(tok[i + maxlen])
print('nb sequences:', len(sentences))
X = np.zeros((len(sentences), maxlen), dtype="int32")
y = np.zeros((len(sentences), maxlen), dtype="int32")
This step isn't working out:
# fill X with the word index at every timestep, same for y
for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence):
        X[i, t] = word_indices[word]
        y[i, t] = word_indices[word]
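My suspicion is that this fails because X and y receive the identical index at every timestep, so the network would only learn to copy its input, and categorical_crossentropy with a softmax over the vocabulary also expects one-hot (or sparse) targets, not this. If the goal is next-word prediction, a shifted target might be what's needed; this is just a sketch of my assumption about what y should hold, not something from the original example:

for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence):
        X[i, t] = word_indices[word]
        # the target at step t is the word at step t + 1; the final
        # target is the word after the whole window, i.e. next_words[i]
        if t < maxlen - 1:
            y[i, t] = word_indices[sentence[t + 1]]
        else:
            y[i, t] = word_indices[next_words[i]]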
And I don't know what input shape I should be using:
from keras.layers.recurrent import GRU
from keras.layers.core import TimeDistributedDense
print('Build model...')
model = Sequential()
model.add(GRU(512, return_sequences=True, input_shape=(len(sentences), maxlen)))
model.add(Dropout(0.2))
model.add(GRU(512, return_sequences=True))
model.add(Dropout(0.2))
#model.add(Dense(len(chars)))
#Insert this instead:
model.add(TimeDistributedDense(len(words)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.fit(X, y, batch_size=128, nb_epoch=2)
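From what I've read, input_shape should never include the number of samples (len(sentences)); each sample is just maxlen integer word indices. The common pattern for word-level models seems to be an Embedding layer, which maps each index to a dense vector and avoids materialising a huge (nb_sequences, maxlen, vocab) one-hot tensor. A rough sketch of what I think the model could look like, assuming a Keras version that provides Embedding, TimeDistributed and sparse_categorical_crossentropy (embedding_dim = 128 is an arbitrary pick of mine):

from keras.models import Sequential
from keras.layers import Embedding, GRU, Dropout, TimeDistributed, Dense, Activation

vocab_size = len(words)
embedding_dim = 128  # hypothetical size, not from the original code

model = Sequential()
# maps each integer word index to a dense vector, so X can stay an
# int matrix of shape (nb_sequences, maxlen) -- no one-hot needed
model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(GRU(512, return_sequences=True))
model.add(Dropout(0.2))
model.add(GRU(512, return_sequences=True))
model.add(Dropout(0.2))
# one softmax over the full vocabulary at every timestep
model.add(TimeDistributed(Dense(vocab_size)))
model.add(Activation('softmax'))
# sparse targets keep y as integer indices; the loss expects a
# trailing dimension of 1, hence y[:, :, None] below
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')
model.fit(X, y[:, :, None], batch_size=128, nb_epoch=2)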