Based on the LSTM code provided in the official Theano tutorial (http://deeplearning.net/tutorial/code/lstm.py), I changed the LSTM layer code (i.e. the functions lstm_layer()
and param_init_lstm()
) to perform a GRU instead.
The provided LSTM code trains well, but not the GRU I coded: the accuracy on the training set with the LSTM goes up to 1 (train cost = 0), while with the GRU it stagnates at 0.7 (train cost = 0.3).
Below is the code I use for the GRU. I kept the same function names as in tutorial, so that one can copy paste the code directly in it. What could explain the poor performance of the GRU?
import numpy as np
def param_init_lstm(options, params, prefix='lstm'):
"""
GRU
"""
W = np.concatenate([ortho_weight(options['dim_proj']), # Weight matrix for the input in the reset gate
ortho_weight(options['dim_proj']),
ortho_weight(options['dim_proj'])], # Weight matrix for the input in the update gate
axis=1)
params[_p(prefix, 'W')] = W
U = np.concatenate([ortho_weight(options['dim_proj']), # Weight matrix for the previous hidden state in the reset gate
ortho_weight(options['dim_proj']),
ortho_weight(options['dim_proj'])], # Weight matrix for the previous hidden state in the update gate
axis=1)
params[_p(prefix, 'U')] = U
b = np.zeros((3 * options['dim_proj'],)) # Biases for the reset gate and the update gate
params[_p(prefix, 'b')] = b.astype(config.floatX)
return params
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
nsteps = state_below.shape[0]
if state_below.ndim == 3:
n_samples = state_below.shape[1]
else:
n_samples = 1
def _slice(_x, n, dim):
if _x.ndim == 3:
return _x[:, :, n * dim:(n + 1) * dim]
return _x[:, n * dim:(n + 1) * dim]
def _step(m_, x_, h_):
preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
preact += x_
r = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) # reset gate
u = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) # update gate
U_h_t = _slice( tparams[_p(prefix, 'U')], 2, options['dim_proj'])
x_h_t = _slice( x_, 2, options['dim_proj'])
h_t_temp = tensor.tanh(tensor.dot(r*h_, U_h_t) + x_h_t)
h = (1. - u) * h_ + u * h_t_temp
h = m_[:,None] * h + (1. - m_)[:,None] * h_
return h
state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
tparams[_p(prefix, 'b')])
dim_proj = options['dim_proj']
rval, updates = theano.scan(_step,
sequences=[mask, state_below],
outputs_info=[tensor.alloc(numpy_floatX(0.),
n_samples,
dim_proj)],
name=_p(prefix, '_layers'),
n_steps=nsteps)
return rval[0]