
[TF 1.8] I'm trying to build a seq2seq model for a toy chatbot to learn about TensorFlow and deep learning. I was able to train and run the model with sampled softmax and beam search, but when I try to apply tf.contrib.seq2seq.LuongAttention via tf.contrib.seq2seq.AttentionWrapper, I get the following error while building the graph:

ValueError: Dimensions must be equal, but are 384 and 256 for 'rnn/while/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/MatMul_2' (op: 'MatMul') with input shapes: [64,384], [256,512].

This is my model:

class ChatBotModel:

def __init__(self, inferring=False, batch_size=1, use_sample_sofmax=True):
    """forward_only: if set, we do not construct the backward pass in the model.
    """
    print('Initialize new model')
    self.inferring = inferring
    self.batch_size = batch_size
    self.use_sample_sofmax = use_sample_sofmax


def build_graph(self):
        # INPUTS
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.placeholder(tf.int32, [None])
        self.Y_seq_len = tf.placeholder(tf.int32, [None])


        self.gl_step = tf.Variable(
                      0, dtype=tf.int32, trainable=False, name='global_step')

        single_cell = tf.nn.rnn_cell.BasicLSTMCell(128)
        keep_prob = tf.cond(tf.convert_to_tensor(self.inferring, tf.bool), lambda: tf.constant(
            1.0), lambda: tf.constant(0.8))
        single_cell = tf.contrib.rnn.DropoutWrapper(
            single_cell, output_keep_prob=keep_prob)
        encoder_cell = tf.contrib.rnn.MultiRNNCell([single_cell for _ in range(2)])

        # ENCODER         
        encoder_out, encoder_state = tf.nn.dynamic_rnn(
            cell = encoder_cell, 
            inputs = tf.contrib.layers.embed_sequence(self.X, 10000, 128),
            sequence_length = self.X_seq_len,
            dtype = tf.float32)
        # encoder_state is ((cell0_c, cell0_h), (cell1_c, cell1_h))

        # DECODER INPUTS
        after_slice = tf.strided_slice(self.Y, [0, 0], [self.batch_size, -1], [1, 1])
        decoder_inputs = tf.concat( [tf.fill([self.batch_size, 1], 2), after_slice], 1)

        # ATTENTION
        attention_mechanism = tf.contrib.seq2seq.LuongAttention(
            num_units = 128, 
            memory = encoder_out,
            memory_sequence_length = self.X_seq_len)

        # DECODER COMPONENTS
        Y_vocab_size = 10000
        decoder_cell = tf.contrib.rnn.MultiRNNCell([single_cell for _ in range(2)])
        decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
            cell = decoder_cell,
            attention_mechanism = attention_mechanism,
            attention_layer_size=128)
        decoder_embedding = tf.Variable(tf.random_uniform([Y_vocab_size, 128], -1.0, 1.0))
        projection_layer = CustomDense(Y_vocab_size)
        if self.use_sample_sofmax:
            softmax_weight = projection_layer.kernel
            softmax_biases = projection_layer.bias

        if not self.inferring:
            # TRAINING DECODER
            training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs = tf.nn.embedding_lookup(decoder_embedding, decoder_inputs),
                sequence_length = self.Y_seq_len,
                time_major = False)

            decoder_initial_state = decoder_cell.zero_state(self.batch_size, dtype=tf.float32).clone(
                cell_state=encoder_state)

            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cell,
                helper = training_helper,
                initial_state = decoder_initial_state,
                output_layer = projection_layer
            )
            training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(self.Y_seq_len))
            training_logits = training_decoder_output.rnn_output

            # LOSS
            softmax_loss_function = None
            if self.use_sample_sofmax:
                def sampled_loss(labels, logits):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(weights=softmax_weight,
                                                      biases=softmax_biases,
                                                      labels=labels,
                                                      inputs=logits,
                                                      num_sampled=64,
                                                      num_classes=10000)
                softmax_loss_function = sampled_loss

            masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
            self.loss = tf.contrib.seq2seq.sequence_loss(logits = training_logits, targets = self.Y, weights = masks, softmax_loss_function=softmax_loss_function)

            # BACKWARD
            params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, params)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            self.train_op = tf.train.AdamOptimizer().apply_gradients(zip(clipped_gradients, params), global_step=self.gl_step)
        else:
            encoder_states = []
            for i in range(2):
                if isinstance(encoder_state[i],tf.contrib.rnn.LSTMStateTuple):
                    encoder_state_c = tf.contrib.seq2seq.tile_batch(encoder_state[i].c, multiplier=2)
                    encoder_state_h = tf.contrib.seq2seq.tile_batch(encoder_state[i].h, multiplier=2)
                    encoder_state = tf.contrib.rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)
                encoder_states.append(encoder_state)
            encoder_states = tuple(encoder_states)

            predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                cell = decoder_cell,
                embedding = decoder_embedding,
                start_tokens = tf.tile(tf.constant([2], dtype=tf.int32), [self.batch_size]),
                end_token = 3,
                initial_state = decoder_initial_state,
                beam_width = 2,
                output_layer = projection_layer,
                length_penalty_weight = 0.0)
            predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = False,
                maximum_iterations = 4 * tf.reduce_max(self.Y_seq_len))
            self.predicting_logits = predicting_decoder_output.predicted_ids

Tracing back a few lines of the log, I saw that the error occurs here:

/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/rnn_cell_impl.py in call(self, inputs, state)
    636 
    637     gate_inputs = math_ops.matmul(
--> 638         array_ops.concat([inputs, h], 1), self._kernel)
    639     gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)

I have checked the 'h' tensor of the LSTM cell and it has the shape [batch_size, 128], so my guess is that the attention output from the previous decoding step is concatenated with the current decoder input, making 'inputs' have the shape [batch_size, 256]; this is then concatenated with the 'h' tensor to form a [batch_size, 384] tensor, causing the error.
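
Writing out the shape arithmetic (a minimal sketch in Python, using the sizes from my model: batch size 64, 128 units for the cells and the attention layer):

num_units = 128                                 # LSTM cell size
attention_layer_size = 128                      # size of the attention output
step_input = num_units + attention_layer_size   # decoder embedding + attention -> 256
concat_width = step_input + num_units           # concat([inputs, h], 1) -> 256 + 128 = 384
kernel_rows = num_units + num_units             # kernel built without attention expects 128 + 128 = 256 rows
# MatMul of [64, 384] with [256, 512]: 384 != 256 -> the ValueError above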

My question is: isn't the attention output supposed to be concatenated with the next decoder input, or am I misunderstanding something? And how do I fix this error?

Sea Otter
1 Answer


You probably already found the answer, but for peeps (like me) who also encounter this error: focus on the second shape. It specifies [256, 512]. Now open up "rnn_cell_impl.py" and go to the line where the concat op takes place. You will notice that the kernel shape is the one being reported as out of sync with your decoder input (which has num_units + attention_layer_size as its 1st dimension, the 0th being your batch_size).

Basically, you are using the same cell you created for the encoder in the decoder as well (it's a 2-layer LSTM with 128 units, right?), hence the kernel shape shows up as [256, 512]. To fix this, add a fresh cell for the decoder between those two lines:

Y_vocab_size = 10000
## create a new base rnn cell for each decoder layer, instead of reusing the encoder's cell
def decode_op_cell():
    return tf.nn.rnn_cell.BasicLSTMCell(128)
decoder_cell = tf.contrib.rnn.MultiRNNCell([decode_op_cell() for _ in range(2)])

Now, if you trace the same line that gave you the error, you will see [64, 384] and [384, 512], which is a legit matmul op and should fix your error. Of course, whatever dropout etc. you want to add, feel free to add inside decode_op_cell as well.
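
For example, a minimal sketch of adding dropout there, reusing the keep_prob logic from the question's encoder:

def decode_op_cell():
    cell = tf.nn.rnn_cell.BasicLSTMCell(128)
    ## same dropout setup as the encoder; keep_prob is the tf.cond tensor from the question
    return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)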

Vikram Murthy