I am currently working on a question answering system. I create a synthetic dataset that contains multiple words in the answers. But, the answers are not a span of the given context.
Initially, I am planning to test it using a deep learning-based model. But I have some problems building the model. This is how I vectorized data.
def vectorize(data, word2idx, story_maxlen, question_maxlen, answer_maxlen):
""" Create the story and question vectors and the label """
Xs, Xq, Y = [], [], []
for story, question, answer in data:
xs = [word2idx[word] for word in story]
xq = [word2idx[word] for word in question]
y = [word2idx[word] for word in answer]
#y = np.zeros(len(word2idx) + 1)
#y[word2idx[answer]] = 1
Xs.append(xs)
Xq.append(xq)
Y.append(y)
return (pad_sequences(Xs, maxlen=story_maxlen),
pad_sequences(Xq, maxlen=question_maxlen),
pad_sequences(Y, maxlen=answer_maxlen))
#np.array(Y))
below is how I create the model.
# story encoder. Output dim: (None, story_maxlen, EMBED_HIDDEN_SIZE)
story_encoder = Sequential()
story_encoder.add(Embedding(input_dim=vocab_size,
output_dim=EMBED_HIDDEN_SIZE,
input_length=story_maxlen))
story_encoder.add(Dropout(0.3))
# question encoder. Output dim: (None, question_maxlen, EMBED_HIDDEN_SIZE)
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,
output_dim=EMBED_HIDDEN_SIZE,
input_length=question_maxlen))
question_encoder.add(Dropout(0.3))
# episodic memory (facts): story * question
# Output dim: (None, question_maxlen, story_maxlen)
facts_encoder = Sequential()
facts_encoder.add(Merge([story_encoder, question_encoder],
mode="dot", dot_axes=[2, 2]))
facts_encoder.add(Permute((2, 1)))
## combine response and question vectors and do logistic regression
answer = Sequential()
answer.add(Merge([facts_encoder, question_encoder],
mode="concat", concat_axis=-1))
answer.add(LSTM(LSTM_OUTPUT_SIZE, return_sequences=True))
answer.add(Dropout(0.3))
answer.add(Flatten())
answer.add(Dense(vocab_size,activation= "softmax"))
answer.compile(optimizer="rmsprop", loss="categorical_crossentropy",
metrics=["accuracy"])
answer.fit([Xs_train, Xq_train], Y_train,
batch_size=BATCH_SIZE, nb_epoch=NBR_EPOCHS,
validation_data=([Xs_test, Xq_test], Y_test))
and this is the summary of the model
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
merge_46 (Merge) (None, 5, 616) 0
_________________________________________________________________
lstm_23 (LSTM) (None, 5, 32) 83072
_________________________________________________________________
dropout_69 (Dropout) (None, 5, 32) 0
_________________________________________________________________
flatten_9 (Flatten) (None, 160) 0
_________________________________________________________________
dense_22 (Dense) (None, 37) 5957
=================================================================
Total params: 93,765.0
Trainable params: 93,765.0
Non-trainable params: 0.0
_________________________________________________________________
It gives the following error.
ValueError: Error when checking model target: expected dense_22 to have shape (None, 37) but got array with shape (1000, 2)
I think the error is related to Y_train, Y_test. I should encode them to categorical values and the answers are not spans of text, but sequential. I don't know what/how to do it. how can I fix it? any ideas?
EDIT:
When I use sparse_categorical_crossentropy in the loss, and Reshape(2,-1); answer.summary()
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
merge_94 (Merge) (None, 5, 616) 0
_________________________________________________________________
lstm_65 (LSTM) (None, 5, 32) 83072
_________________________________________________________________
dropout_139 (Dropout) (None, 5, 32) 0
_________________________________________________________________
reshape_22 (Reshape) (None, 2, 80) 0
_________________________________________________________________
dense_44 (Dense) (None, 2, 37) 2997
=================================================================
Total params: 90,805.0
Trainable params: 90,805.0
Non-trainable params: 0.0
_________________________________________________________________
EDIT2: The model after modifications
# story encoder. Output dim: (None, story_maxlen, EMBED_HIDDEN_SIZE)
story_encoder = Sequential()
story_encoder.add(Embedding(input_dim=vocab_size,
output_dim=EMBED_HIDDEN_SIZE,
input_length=story_maxlen))
story_encoder.add(Dropout(0.3))
# question encoder. Output dim: (None, question_maxlen, EMBED_HIDDEN_SIZE)
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,
output_dim=EMBED_HIDDEN_SIZE,
input_length=question_maxlen))
question_encoder.add(Dropout(0.3))
# episodic memory (facts): story * question
# Output dim: (None, question_maxlen, story_maxlen)
facts_encoder = Sequential()
facts_encoder.add(Merge([story_encoder, question_encoder],
mode="dot", dot_axes=[2, 2]))
facts_encoder.add(Permute((2, 1)))
## combine response and question vectors and do logistic regression
## combine response and question vectors and do logistic regression
answer = Sequential()
answer.add(Merge([facts_encoder, question_encoder],
mode="concat", concat_axis=-1))
answer.add(LSTM(LSTM_OUTPUT_SIZE, return_sequences=True))
answer.add(Dropout(0.3))
#answer.add(Flatten())
answer.add(keras.layers.Reshape((2, -1)))
answer.add(Dense(vocab_size,activation= "softmax"))
answer.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy",
metrics=["accuracy"])
answer.fit([Xs_train, Xq_train], Y_train,
batch_size=BATCH_SIZE, nb_epoch=NBR_EPOCHS,
validation_data=([Xs_test, Xq_test], Y_test))
It still gives
ValueError: Error when checking model target: expected dense_46 to have 3 dimensions, but got array with shape (1000, 2)