Concatenate encoder hidden states/cells/outputs from different sources for attention calculation - issues?

Question

I am using PyTorch for an LSTM encoder-decoder sequence-to-sequence prediction problem. As a first step, I would like to forecast 2D trajectories (trajectory x, trajectory y) from multivariate input with two or more features per time step (trajectory x, trajectory y, speed, rotation, etc.).

I am following the below notebook (link):

seq2seq with Attention

Here are excerpts (encoder, decoder, attention):

class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded index sequences.

    The forward and backward GRU outputs are summed, so the encoder outputs
    have width ``hidden_size`` rather than ``2 * hidden_size``.

    Args:
        input_size: Number of entries in the embedding table.
        hidden_size: Embedding width and GRU hidden width (per direction).
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability applied between GRU layers.
    """

    def __init__(self, input_size, hidden_size, n_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_size, hidden_size)
        # nn.GRU only applies dropout *between* layers; passing a non-zero
        # value with a single layer has no effect and triggers a warning.
        self.gru = nn.GRU(
            hidden_size,
            hidden_size,
            n_layers,
            dropout=(0 if n_layers == 1 else self.dropout),
            bidirectional=True,
        )

    def forward(self, input_seqs, input_lengths, hidden=None):
        """Encode a padded batch of sequences in one pass.

        Args:
            input_seqs: Index tensor fed to the embedding; the packing call
                uses the default time-major layout (S x B).
            input_lengths: Valid length of each sequence in the batch, as
                required by ``pack_padded_sequence`` (sorted in decreasing
                order under its default ``enforce_sorted=True``).
            hidden: Optional initial GRU hidden state.

        Returns:
            Tuple ``(outputs, hidden)``: ``outputs`` is the padded
            S x B x hidden_size sum of both directions, ``hidden`` is the
            final GRU hidden state exactly as returned by ``nn.GRU``.
        """
        # Note: we run this all at once (over the whole padded batch)
        embedded = self.embedding(input_seqs)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        # Unpack (back to padded); the returned lengths are not needed here.
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional outputs: first half is forward, second is backward.
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        return outputs, hidden


class LuongAttnDecoderRNN(nn.Module):
    """Single-step GRU decoder with Luong-style attention over encoder outputs.

    Args:
        attn_model: Attention scoring method forwarded to ``Attn``; the
            string ``'none'`` skips creating the attention module.
        hidden_size: Embedding width and GRU hidden width.
        output_size: Size of the embedding table and of the output layer.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability for the embedding and between GRU layers.

    Note:
        ``forward`` always calls ``self.attn``; with ``attn_model='none'``
        that attribute is never created, so ``forward`` cannot be used
        unless an attention module is assigned afterwards.
    """

    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout)
        # Inter-layer dropout is meaningless (and warns) for a single layer.
        self.gru = nn.GRU(
            hidden_size,
            hidden_size,
            n_layers,
            dropout=(0 if n_layers == 1 else dropout),
        )
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_seq, last_hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input_seq: Batch of B input indices (the previous output step).
            last_hidden: Previous GRU hidden state.
            encoder_outputs: Encoder outputs, S x B x N.

        Returns:
            Tuple ``(output, hidden, attn_weights)``: unnormalised scores of
            shape B x output_size, the new GRU hidden state, and the
            attention weights as returned by the attention module.
        """
        # Note: we run this one step at a time

        # Get the embedding of the current input (last output)
        batch_size = input_seq.size(0)
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        embedded = embedded.view(1, batch_size, self.hidden_size)  # S=1 x B x N

        # Get current hidden state from input and last hidden state
        rnn_output, hidden = self.gru(embedded, last_hidden)

        # Calculate attention from current RNN state and all encoder outputs;
        # apply to encoder outputs to get weighted average
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # B x S=1 x N

        # Attentional vector using the RNN hidden state and context vector
        # concatenated together (Luong eq. 5)
        rnn_output = rnn_output.squeeze(0)  # S=1 x B x N -> B x N
        context = context.squeeze(1)        # B x S=1 x N -> B x N
        concat_input = torch.cat((rnn_output, context), 1)
        # torch.tanh replaces the deprecated F.tanh
        concat_output = torch.tanh(self.concat(concat_input))

        # Finally predict next token (Luong eq. 6, without softmax)
        output = self.out(concat_output)

        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights

For calculating attention in the decoder stage, the encoder hidden state and encoder outputs are input and used as below:

class Attn(nn.Module):
    """Luong-style attention producing weights over encoder time steps.

    Args:
        method: Scoring function, one of ``'dot'``, ``'general'`` or
            ``'concat'``.
        hidden_size: Width N of the decoder state and encoder outputs.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """

    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()

        self.method = method
        self.hidden_size = hidden_size

        # Fail at construction instead of letting score() fall through
        # every branch and return None.
        if self.method not in ('dot', 'general', 'concat'):
            raise ValueError(
                "%r is not an appropriate attention method" % (self.method,)
            )

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # Keep the 1 x N parameter shape, but give it defined values:
            # torch.FloatTensor(1, N) would hold uninitialised memory.
            bound = hidden_size ** -0.5
            self.v = nn.Parameter(torch.empty(1, hidden_size).uniform_(-bound, bound))

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights for one decoder step.

        Args:
            hidden: Current decoder state, 1 x B x N.
            encoder_outputs: Encoder outputs, S x B x N.

        Returns:
            Attention weights of shape B x 1 x S; each row sums to 1 over S.
        """
        # Score every (time step, batch) pair in one vectorised call; the
        # 1 x B x N state broadcasts against the S x B x N encoder outputs.
        attn_energies = self.score(hidden, encoder_outputs)  # S x B

        # Normalize energies over the time axis, then reshape to B x 1 x S.
        # The tensor stays on whatever device the inputs live on.
        return F.softmax(attn_energies.transpose(0, 1), dim=1).unsqueeze(1)

    def score(self, hidden, encoder_output):
        """Return attention energies, reducing over the trailing N axis.

        ``hidden`` must be broadcastable to ``encoder_output``. The result
        has the broadcast shape without the last axis, so S x B for the
        batched call in ``forward`` and a 1-element tensor for a pair of
        1 x N inputs.
        """
        if self.method == 'dot':
            # Element-wise product + sum is a batched dot product;
            # Tensor.dot only accepts 1-D tensors.
            return torch.sum(hidden * encoder_output, dim=-1)

        if self.method == 'general':
            energy = self.attn(encoder_output)
            return torch.sum(hidden * energy, dim=-1)

        if self.method == 'concat':
            expanded = hidden.expand_as(encoder_output)
            # tanh as in Luong's concat score; without it the score is
            # linear and softmax over time cancels the decoder-state term.
            energy = torch.tanh(self.attn(torch.cat((expanded, encoder_output), dim=-1)))
            return torch.sum(self.v * energy, dim=-1)

        raise ValueError(
            "%r is not an appropriate attention method" % (self.method,)
        )

My actual goal is to extend the method by adding further information to be fed into the decoder, such as image data at each input time step. Technically, I want to use two (or more) encoders, one for the trajectories as in the link above, and one separate one for image data (convolutional encoder).

I do this by concatenating embeddings produced by the trajectory encoder and the convolutional encoder (as well as the cell states etc.) and feeding the concatenated tensors to the decoder.

For example, image embedding (256-length tensor) concatenated with trajectory data embedding (256-length tensor) yields a 512-length embedding.

My question is: is it a problem for the attention calculation if I use a concatenated encoder hidden state, concatenated encoder cell state, and concatenated encoder output coming from those different sources rather than hidden states, cells, outputs coming from a single source?

What caveats should I be aware of, and what pre-processing is needed, to make this work?

Thank you very much in advance.

Concatenate encoder hidden states/cells/outputs from different sources for attention calculation - issues?

0 Answers