I'm trying to reimplement the torch.nn.RNN module without the C++/CUDA bindings, i.e., using plain tensor operations and the associated Python logic. I have written the RNN class and testing logic below, which compares its output against a reference torch.nn.RNN instance.
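For reference, the per-timestep update I'm targeting (bias=False, nonlinearity='tanh') is the one given in the torch.nn.RNN docs, applied independently in each direction: h_t = tanh(x_t @ W_ih.T + h_(t-1) @ W_hh.T).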
import torch
import torch.nn as nn


class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, bidirectional=False):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        # One input-to-hidden weight matrix per layer; the reverse direction gets its own set
        self.w_ih = [torch.randn(hidden_size, input_size)]
        if bidirectional:
            self.w_ih_reverse = [torch.randn(hidden_size, input_size)]

        for layer in range(num_layers - 1):
            self.w_ih.append(torch.randn(hidden_size, hidden_size))
            if bidirectional:
                self.w_ih_reverse.append(torch.randn(hidden_size, hidden_size))

        # Hidden-to-hidden weights, one hidden_size x hidden_size matrix per layer
        self.w_hh = torch.randn(num_layers, hidden_size, hidden_size)
        if bidirectional:
            self.w_hh_reverse = torch.randn(num_layers, hidden_size, hidden_size)

    def forward(self, input, h_0=None):
        # h_0 is laid out as [directions, layers, batch, hidden]
        if h_0 is None:
            if self.bidirectional:
                h_0 = torch.zeros(2, self.num_layers, input.shape[1], self.hidden_size)
            else:
                h_0 = torch.zeros(1, self.num_layers, input.shape[1], self.hidden_size)

        if self.bidirectional:
            output = torch.zeros(input.shape[0], input.shape[1], 2 * self.hidden_size)
        else:
            output = torch.zeros(input.shape[0], input.shape[1], self.hidden_size)

        # Outer loop over timesteps; the reverse direction reads the sequence back to front
        for t in range(input.shape[0]):
            input_t = input[t]
            if self.bidirectional:
                input_t_reversed = input[-1 - t]

            # Inner loop stacks the layers at the current timestep
            for layer in range(self.num_layers):
                h_t = torch.tanh(torch.matmul(input_t, self.w_ih[layer].T) + torch.matmul(h_0[0][layer], self.w_hh[layer].T))
                h_0[0][layer] = h_t
                if self.bidirectional:
                    h_t_reverse = torch.tanh(torch.matmul(input_t_reversed, self.w_ih_reverse[layer].T) + torch.matmul(h_0[1][layer], self.w_hh_reverse[layer].T))
                    h_0[1][layer] = h_t_reverse

                input_t = h_t
                if self.bidirectional:
                    # This logic is incorrect for bidirectional RNNs with multiple layers
                    input_t = torch.cat((h_t, h_t_reverse), dim=-1)
                    input_t_reversed = input_t

            # Forward states fill the first half of the feature dimension, reverse states the second half
            output[t, :, :self.hidden_size] = h_t
            if self.bidirectional:
                output[-1 - t, :, self.hidden_size:] = h_t_reverse

        return output
if __name__ == '__main__':
    input_size = 10
    hidden_size = 12
    num_layers = 2
    batch_size = 2
    bidirectional = True

    input = torch.randn(2, batch_size, input_size)

    rnn_val = torch.nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bias=False, bidirectional=bidirectional, nonlinearity='tanh')
    rnn = RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)

    # Copy the reference module's weights into my implementation so the outputs are directly comparable
    for i in range(rnn_val.num_layers):
        rnn.w_ih[i] = rnn_val._parameters['weight_ih_l%d' % i].data
        rnn.w_hh[i] = rnn_val._parameters['weight_hh_l%d' % i].data
        if bidirectional:
            rnn.w_ih_reverse[i] = rnn_val._parameters['weight_ih_l%d_reverse' % i].data
            rnn.w_hh_reverse[i] = rnn_val._parameters['weight_hh_l%d_reverse' % i].data

    output_val, hn_val = rnn_val(input)
    output = rnn(input)

    print(output_val)
    print(output)
My implementation appears to work for vanilla RNNs with an arbitrary number of layers and different batch sizes/sequence lengths, as well as for single-layer bidirectional RNNs. However, it does not produce the correct result for multi-layer bidirectional RNNs.
For the sake of simplicity, bias terms are not currently implemented and only the tanh activation function is supported. I have narrowed the logic error down to the line input_t = torch.cat((h_t, h_t_reverse), dim=-1), as the first element of the output sequence is already incorrect.
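To test my reading of how the reference handles stacking, I also put together the sketch below (the variable names are my own): it rebuilds a 2-layer bidirectional torch.nn.RNN as two stacked single-layer bidirectional RNNs, running the whole sequence through layer 0 and then feeding its concatenated forward/backward output into layer 1, which is how I currently understand the docs.

import torch

torch.manual_seed(0)
seq_len, batch_size, input_size, hidden_size = 5, 2, 10, 12
x = torch.randn(seq_len, batch_size, input_size)

# Full 2-layer bidirectional reference module
full = torch.nn.RNN(input_size, hidden_size, num_layers=2, bias=False,
                    bidirectional=True, nonlinearity='tanh')

# The same network rebuilt as two stacked single-layer bidirectional RNNs;
# layer 1 consumes 2 * hidden_size features, i.e. both directions of layer 0
layer0 = torch.nn.RNN(input_size, hidden_size, num_layers=1, bias=False,
                      bidirectional=True, nonlinearity='tanh')
layer1 = torch.nn.RNN(2 * hidden_size, hidden_size, num_layers=1, bias=False,
                      bidirectional=True, nonlinearity='tanh')

with torch.no_grad():
    layer0.weight_ih_l0.copy_(full.weight_ih_l0)
    layer0.weight_hh_l0.copy_(full.weight_hh_l0)
    layer0.weight_ih_l0_reverse.copy_(full.weight_ih_l0_reverse)
    layer0.weight_hh_l0_reverse.copy_(full.weight_hh_l0_reverse)
    layer1.weight_ih_l0.copy_(full.weight_ih_l1)
    layer1.weight_hh_l0.copy_(full.weight_hh_l1)
    layer1.weight_ih_l0_reverse.copy_(full.weight_ih_l1_reverse)
    layer1.weight_hh_l0_reverse.copy_(full.weight_hh_l1_reverse)

out_full, _ = full(x)
out0, _ = layer0(x)            # whole sequence through layer 0 first
out_stacked, _ = layer1(out0)  # then its full output, both directions concatenated, into layer 1

print(torch.allclose(out_full, out_stacked, atol=1e-6))  # I expect True if my reading of the docs is right

If that equivalence holds, my per-timestep concatenation across layers is presumably where things diverge.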
I would greatly appreciate it if someone could point me in the right direction and let me know what the problem is!