[TF 1.8] I'm trying to build a seq2seq model for a toy chatbot to learn about tensorflow and deep learning. I was able to train and run the model with sampled softmax and beam search but then I try to apply tf.contrib.seq2seq.LuongAttention using tf.contrib.seq2seq.AttentionWrapper I get the following error while building the graph:
ValueError: Dimensions must be equal, but are 384 and 256 for 'rnn/while/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/MatMul_2' (op: 'MatMul') with input shapes: [64,384], [256,512].
This is my model:
class ChatBotModel:
def __init__(self, inferring=False, batch_size=1, use_sample_sofmax=True):
"""forward_only: if set, we do not construct the backward pass in the model.
"""
print('Initialize new model')
self.inferring = inferring
self.batch_size = batch_size
self.use_sample_sofmax = use_sample_sofmax
def build_graph(self):
# INPUTS
self.X = tf.placeholder(tf.int32, [None, None])
self.Y = tf.placeholder(tf.int32, [None, None])
self.X_seq_len = tf.placeholder(tf.int32, [None])
self.Y_seq_len = tf.placeholder(tf.int32, [None])
self.gl_step = tf.Variable(
0, dtype=tf.int32, trainable=False, name='global_step')
single_cell = tf.nn.rnn_cell.BasicLSTMCell(128)
keep_prob = tf.cond(tf.convert_to_tensor(self.inferring, tf.bool), lambda: tf.constant(
1.0), lambda: tf.constant(0.8))
single_cell = tf.contrib.rnn.DropoutWrapper(
single_cell, output_keep_prob=keep_prob)
encoder_cell = tf.contrib.rnn.MultiRNNCell([single_cell for _ in range(2)])
# ENCODER
encoder_out, encoder_state = tf.nn.dynamic_rnn(
cell = encoder_cell,
inputs = tf.contrib.layers.embed_sequence(self.X, 10000, 128),
sequence_length = self.X_seq_len,
dtype = tf.float32)
# encoder_state is ((cell0_c, cell0_h), (cell1_c, cell1_h))
# DECODER INPUTS
after_slice = tf.strided_slice(self.Y, [0, 0], [self.batch_size, -1], [1, 1])
decoder_inputs = tf.concat( [tf.fill([self.batch_size, 1], 2), after_slice], 1)
# ATTENTION
attention_mechanism = tf.contrib.seq2seq.LuongAttention(
num_units = 128,
memory = encoder_out,
memory_sequence_length = self.X_seq_len)
# DECODER COMPONENTS
Y_vocab_size = 10000
decoder_cell = tf.contrib.rnn.MultiRNNCell([single_cell for _ in range(2)])
decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
cell = decoder_cell,
attention_mechanism = attention_mechanism,
attention_layer_size=128)
decoder_embedding = tf.Variable(tf.random_uniform([Y_vocab_size, 128], -1.0, 1.0))
projection_layer = CustomDense(Y_vocab_size)
if self.use_sample_sofmax:
softmax_weight = projection_layer.kernel
softmax_biases = projection_layer.bias
if not self.inferring:
# TRAINING DECODER
training_helper = tf.contrib.seq2seq.TrainingHelper(
inputs = tf.nn.embedding_lookup(decoder_embedding, decoder_inputs),
sequence_length = self.Y_seq_len,
time_major = False)
decoder_initial_state = decoder_cell.zero_state(self.batch_size, dtype=tf.float32).clone(
cell_state=encoder_state)
training_decoder = tf.contrib.seq2seq.BasicDecoder(
cell = decoder_cell,
helper = training_helper,
initial_state = decoder_initial_state,
output_layer = projection_layer
)
training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
decoder = training_decoder,
impute_finished = True,
maximum_iterations = tf.reduce_max(self.Y_seq_len))
training_logits = training_decoder_output.rnn_output
# LOSS
softmax_loss_function = None
if self.use_sample_sofmax:
def sampled_loss(labels, logits):
labels = tf.reshape(labels, [-1, 1])
return tf.nn.sampled_softmax_loss(weights=softmax_weight,
biases=softmax_biases,
labels=labels,
inputs=logits,
num_sampled=64,
num_classes=10000)
softmax_loss_function = sampled_loss
masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
self.loss = tf.contrib.seq2seq.sequence_loss(logits = training_logits, targets = self.Y, weights = masks, softmax_loss_function=softmax_loss_function)
# BACKWARD
params = tf.trainable_variables()
gradients = tf.gradients(self.loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
self.train_op = tf.train.AdamOptimizer().apply_gradients(zip(clipped_gradients, params), global_step=self.gl_step)
else:
encoder_states = []
for i in range(2):
if isinstance(encoder_state[i],tf.contrib.rnn.LSTMStateTuple):
encoder_state_c = tf.contrib.seq2seq.tile_batch(encoder_state[i].c, multiplier=2)
encoder_state_h = tf.contrib.seq2seq.tile_batch(encoder_state[i].h, multiplier=2)
encoder_state = tf.contrib.rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)
encoder_states.append(encoder_state)
encoder_states = tuple(encoder_states)
predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
cell = decoder_cell,
embedding = decoder_embedding,
start_tokens = tf.tile(tf.constant([2], dtype=tf.int32), [self.batch_size]),
end_token = 3,
initial_state = decoder_initial_state,
beam_width = 2,
output_layer = projection_layer,
length_penalty_weight = 0.0)
predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
decoder = predicting_decoder,
impute_finished = False,
maximum_iterations = 4 * tf.reduce_max(self.Y_seq_len))
self.predicting_logits = predicting_decoder_output.predicted_ids
Tracing back a few lines of log and I saw that the error occurs here:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/rnn_cell_impl.py in call(self, inputs, state)
636
637 gate_inputs = math_ops.matmul(
--> 638 array_ops.concat([inputs, h], 1), self._kernel)
639 gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
I have checked the 'h' tensor of the LSTM cell and it has the shape of [batch_size, 128] so my guess is that the attention output from the previous decoding step is concatenated with the current encoder's input make the 'inputs' has the shape of [batch_size, 256] then it is concatenated with 'h' tensor to form a [batch_size, 384] tensor causing this error.
My question is: Isn't attention output supposed to be concatenated with the next decoder's input or I miss understanding anything? And how to fix this error.