I am implementing code to generate labeled data for Natural Language Understanding (NLU), following the article "Labeled Data Generation with Encoder-decoder LSTM for Semantic Slot Filling" (https://pdfs.semanticscholar.org/7ffe/83d7dd3a474e15ccc2aef412009f100a5802.pdf). My architecture is a simple encoder-decoder LSTM, but since the sentences it generates (both words and labels) are not correct, I am first trying to make it reproduce exactly the same sentence (words only) that I give as input. Unfortunately, even this is not working.
I am using word2vec for the word embeddings, with the embedding dimension set to 64 (as suggested in the article). The encoder LSTM receives the sequence in reversed order and uses a dropout rate of 0.5. The decoder LSTM also uses a dropout rate of 0.5, with a softmax layer applied at each output of the sequence to pick the most probable word. The inputs are exactly the same as the targets (same sentences), since for now I just want to reproduce the input sentence.
For training, I used the Adam optimizer with categorical_crossentropy as the loss. For inference, I use beam search (B=3) to generate sequences.
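Since the targets are the input sentences themselves and the loss is categorical_crossentropy, the target tensors are one-hot encoded over the word vocabulary. As a minimal sketch (assuming train_w and val_w are integer matrices of shape (num_sentences, maxlen) and vocab_w_size is the word vocabulary size), they can be built like this:

from keras.utils import to_categorical

# One-hot targets of shape (num_sentences, maxlen, vocab_w_size), matching categorical_crossentropy.
train_lab_w = to_categorical(train_w, num_classes=vocab_w_size)
val_lab_w = to_categorical(val_w, num_classes=vocab_w_size)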
My training code:
import numpy as np
from math import log
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
from keras import backend as K

def pretrained_embedding_layer(emb):
    # Frozen Embedding layer initialised with the pretrained word2vec matrix.
    vocab_len = len(emb)
    emb_dim = len(emb[0])
    emb_layer = Embedding(vocab_len, emb_dim, trainable=False)
    emb_layer.build((None,))
    emb_layer.set_weights([emb])
    return emb_layer

LSTM_encoder = LSTM(1024, dropout=0.5, return_state=True, go_backwards=True, name='lstm_encoder')
LSTM_decoder = LSTM(1024, dropout=0.5, return_sequences=True, return_state=True, name='lstm_decoder')
dense_w = Dense(vocab_w_size, activation='softmax', name="word_output")

K.set_learning_phase(1)

def model1_enc_dec(input_shape, w_emb):
    words_indices = Input(shape=input_shape, dtype='int32')
    wemb_layer = pretrained_embedding_layer(w_emb)
    wemb = wemb_layer(words_indices)
    # The encoder reads the (reversed) sequence; its final states initialise the decoder.
    enc_out, enc_state_h, enc_state_c = LSTM_encoder(wemb)
    encoder_states = [enc_state_h, enc_state_c]
    dec_out, dec_state_h, dec_state_c = LSTM_decoder(wemb, initial_state=encoder_states)
    dec_out = dense_w(dec_out)
    model1 = Model(inputs=[words_indices], outputs=[dec_out])
    return model1

model = model1_enc_dec((maxlen,), w_emb)
model.summary()
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])
model.fit(train_w, train_lab_w, validation_data=(val_w, val_lab_w), epochs=epochs, verbose=1, shuffle=True)
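As a quick sanity check of the trained model itself (before any step-by-step decoding), I also look at its argmax predictions on one training sentence. A rough sketch, reusing the i2w index-to-word dictionary from the inference code below:

# Feed one training sentence through the trained model and decode the argmax at each time step.
probs = model.predict(train_w[:1])         # shape: (1, maxlen, vocab_w_size)
pred_ids = np.argmax(probs, axis=-1)[0]    # most probable word index per position
print(' '.join(i2w[idx] for idx in pred_ids))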
My inference code:
wemb_layer = Embedding(len(w_emb), len(w_emb[0]), trainable=False)
wemb_layer.build((None,))
LSTM_encoder = LSTM(1024, return_state=True, go_backwards=True, name='lstm_encoder')
LSTM_decoder = LSTM(1024, return_sequences=True, return_state=True, name='lstm_decoder')
dense_w = Dense(vocab_w_size, activation='softmax', name="word_output")

def target_model(input_shape):
    words_indices = Input(shape=input_shape, dtype='int32')
    wemb = wemb_layer(words_indices)
    enc_out, enc_state_h, enc_state_c = LSTM_encoder(wemb)
    encoder_states = [enc_state_h, enc_state_c]
    dec_out, dec_state_h, dec_state_c = LSTM_decoder(wemb, initial_state=encoder_states)
    dec_out = dense_w(dec_out)
    model = Model(inputs=[words_indices], outputs=[dec_out])
    return model

target_model = target_model((maxlen,))

# Copy the trained weights into the inference layers (layer 0 of the trained model is the Input).
wemb_layer.set_weights(model.layers[1].get_weights())
LSTM_encoder.set_weights(model.layers[2].get_weights())
LSTM_decoder.set_weights(model.layers[3].get_weights())
dense_w.set_weights(model.layers[4].get_weights())
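As an alternative to the index-based copy above, the same weights can be fetched by layer name, which is less sensitive to layer ordering (this assumes the names 'lstm_encoder', 'lstm_decoder' and 'word_output' given when the layers were created):

# Fetch the trained weights by layer name instead of positional index.
LSTM_encoder.set_weights(model.get_layer('lstm_encoder').get_weights())
LSTM_decoder.set_weights(model.get_layer('lstm_decoder').get_weights())
dense_w.set_weights(model.get_layer('word_output').get_weights())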
def model1_enco_infe(input_shape):
    words_indices = Input(shape=input_shape, dtype='int32')
    wemb = wemb_layer(words_indices)
    enc_out, enc_state_h, enc_state_c = LSTM_encoder(wemb)
    encoder_model = Model(inputs=[words_indices], outputs=[enc_state_h, enc_state_c])
    return encoder_model
def model1_deco_infe(input_shape):
    dec_word_input = Input(shape=input_shape, dtype='int32')
    dec_state_input_h = Input(shape=(1024,))
    dec_state_input_c = Input(shape=(1024,))
    wemb = wemb_layer(dec_word_input)
    dec_states_input = [dec_state_input_h, dec_state_input_c]
    dec_out, dec_state_h, dec_state_c = LSTM_decoder(wemb, initial_state=dec_states_input)
    dec_states_output = [dec_state_h, dec_state_c]
    deco_out = dense_w(dec_out)
    decoder_model = Model(inputs=[dec_word_input] + dec_states_input,
                          outputs=[deco_out] + dec_states_output)
    return decoder_model
encoder_model = model1_enco_infe((maxlen,))
decoder_model = model1_deco_infe((1,))
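Before running the beam search, I also check the two inference models with a plain greedy decoding loop. This is only a rough sketch; it assumes the same w2i / i2w dictionaries used below and stops at 'EOS':

def greedy_decode(word_seq, encoder_model, decoder_model, w2i, i2w, max_len=50):
    # Encode the input sentence, then decode it back one word at a time, always taking the argmax.
    state_h, state_c = encoder_model.predict(word_seq)
    states = [state_h, state_c]
    cur_word = np.array([[w2i['BOS']]])
    decoded = []
    for _ in range(max_len):
        dec_w_out, state_h, state_c = decoder_model.predict([cur_word] + states)
        idx = int(np.argmax(dec_w_out[0, -1, :]))
        word = i2w[idx]
        if word == 'EOS':
            break
        decoded.append(word)
        states = [state_h, state_c]
        cur_word = np.array([[idx]])
    return ' '.join(decoded)

print(greedy_decode(np.reshape(train_w[0], (1, maxlen)), encoder_model, decoder_model, w2i, i2w))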
def beamsearch_B(deco_w_out, beam):
    # Return the indices of the `beam` most probable words in the decoder softmax output.
    words_index = []
    dw = deco_w_out.copy()
    for i in range(beam):
        word_index = np.argmax(dw, axis=-1)
        dw[0][0][word_index[0][0]] = 0
        words_index.append(word_index[0][0])
    return words_index
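As a quick illustration of what this helper returns (dec_w_out here stands for one decoder softmax output of shape (1, 1, vocab_w_size)):

# Indices of the three most probable words for the current step, most probable first.
candidates = beamsearch_B(dec_w_out, 3)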
def generate_model1_add(word_seq, encoder_model, decoder_model, dec_word_input, id2word, beam):
    # Encode the input sentence; the encoder states initialise the decoder.
    [enc_state_h, enc_state_c] = encoder_model.predict(word_seq)
    states = [enc_state_h, enc_state_c]
    word_sentence = ''
    probs_word = []
    word_sentences = []
    dec_word_inputs = []
    states_beam = []
    stop_condition = False
    # First decoding step: expand the BOS token into `beam` candidate words.
    [dec_w_out, dec_state_h, dec_state_c] = decoder_model.predict([dec_word_input] + states)
    words_index = beamsearch_B(dec_w_out, beam)
    for i in range(beam):
        probs_word.append(-log(dec_w_out[0][0][words_index[i]]))
        word_sentences.append(id2word[words_index[i]])
        dec_word_inputs.append([words_index[i]])
        states_beam.append([dec_state_h, dec_state_c])
    n_words = 1
    endgame = []
    while not stop_condition:
        words_indexes, words_sentences, probs_words, states_b = [], [], [], []
        # Expand each of the `beam` hypotheses with its `beam` best next words.
        for k in range(beam):
            [dec_w_out, dec_state_h, dec_state_c] = decoder_model.predict([dec_word_inputs[k]] + states_beam[k])
            words_index = beamsearch_B(dec_w_out, beam)
            states = [dec_state_h, dec_state_c]
            for j in range(beam):
                words_indexes.append(words_index[j])
                probs_words.append(probs_word[k] * -log(dec_w_out[0][0][words_index[j]]) + 1e-7)
                words_sentences.append(word_sentences[k] + ' ' + id2word[words_index[j]])
                states_b.append(states)
        # Keep the `beam` candidate hypotheses with the best accumulated scores.
        probs = []
        for i in range(len(probs_words)):
            probs.append(1 / (probs_words[i]))
        indexes = []
        for i in range(beam):
            index = np.argmax(probs, axis=-1)
            probs[index] = 0
            indexes.append(index)
        for i in range(beam):
            probs_word[i] = probs_words[indexes[i]]
            word_sentences[i] = words_sentences[indexes[i]]
            dec_word_inputs[i] = [words_indexes[indexes[i]]]
            states_beam[i] = states_b[indexes[i]]
            if id2word[words_indexes[indexes[i]]] == 'EOS':
                endgame.append(i)
        if len(endgame) == 1:
            word_sentence = word_sentences[endgame[0]]
            stop_condition = True
        elif len(endgame) > 1:
            word_sentence = word_sentences[np.min(endgame)]
            stop_condition = True
        n_words += 1
        if n_words > 50:
            word_sentence = word_sentences[0]
            stop_condition = True
    return word_sentence
word_sentence = generate_model1_add(np.reshape(train_w[i], (1, maxlen)),
                                    encoder_model, decoder_model, [w2i['BOS']], i2w, 3)
An example of my generated sequences:
Input sentence: BOS i 'm fourth in flying from boston to atlanta EOS PAD PAD PAD ...
Generated sentence: BOS from from from from from from from from from from from from from from from from from from from ...
It seems that the trained weights are not correct, yet during training I got loss: 0.0032 - acc: 0.9990 - val_loss: 0.0794 - val_acc: 0.9888.
What I want is simply to generate exactly the same sentence as the input. I hope you can help me. Thank you in advance!