theano GRU rnn adam optimizer

Question

Technical information:

OS: Mac OS X 10.9.5

IDE: Eclipse Mars.1 Release (4.5.1), with PyDev and Anaconda interpreter (grammar version 3.4)

GPU: NVIDIA GeForce GT 650M

Libs: numpy, aeosa, Sphinx-1.3.1, Theano 0.7, nltk-3.1

My background: I am very new to theano and numpy and haven't taken a formal course in machine learning or discrete math.

The recurrent neural network for natural language processing I currently use is taken from here:

https://github.com/dennybritz/rnn-tutorial-gru-lstm/blob/master/gru_theano.py

The only change made to this file is replacing references to theano.config.floatX with the string 'float32'.

I also use the utils.py and train.py modules included in the repository, with only minor changes.

The adam optimizer I plan to incorporate in place of the sgd/rms code implemented in the example repository is found here: https://gist.github.com/skaae/ae7225263ca8806868cb

Reproduced here (again with references to the .config.floatX replaced with the hard-coded 'float32'):

(Import aliases used in the code below: theano as th, theano.shared as thsh, theano.tensor as T, numpy as np.)

def adam(loss, all_params, learning_rate=0.001, b1=0.9, b2=0.999, e=1e-8, gamma=1-1e-8):
    """
    Build the ADAM update rules for a set of shared parameters.

    Default values are taken from [Kingma2014].

    Parameters:
        loss: symbolic scalar expression that is differentiated with respect
            to every entry of ``all_params``.
        all_params: list of shared variables to optimise; each must support
            ``get_value()`` so that its shape can be read.
        learning_rate: step size (alpha).
        b1: decay rate of the first moment running average.
        b2: decay rate of the second raw moment running average.
        e: constant added to the denominator to avoid division by zero.
        gamma: per-step decay applied to ``b1``.

    Returns:
        A list of ``(shared_variable, new_value_expression)`` pairs: for each
        parameter its first moment, second moment and the parameter itself,
        followed by one pair that increments the step counter.

    References:
    [Kingma2014] Kingma, Diederik, and Jimmy Ba.
    "Adam: A Method for Stochastic Optimization."
    arXiv preprint arXiv:1412.6980 (2014).
    http://arxiv.org/pdf/1412.6980v4.pdf
    """

    updates = []
    all_grads = th.grad(loss, all_params)
    alpha = learning_rate
    # Step counter; starts at 1 and is incremented by the last update below.
    t = thsh(np.float32(1))
    b1_t = b1*gamma**(t-1)   #(Decay the first moment running average coefficient)

    for theta_previous, g in zip(all_params, all_grads):
        # ``shape`` is a plain tuple and has no ``astype``; the dtype has to
        # be given to ``np.zeros`` rather than applied to the shape.
        param_shape = theta_previous.get_value().shape
        m_previous = thsh(np.zeros(param_shape, dtype='float32'))
        v_previous = thsh(np.zeros(param_shape, dtype='float32'))

        m = b1_t*m_previous + (1 - b1_t)*g  # (Update biased first moment estimate)
        v = b2*v_previous + (1 - b2)*g**2   # (Update biased second raw moment estimate)
        m_hat = m / (1-b1**t)               # (Compute bias-corrected first moment estimate)
        v_hat = v / (1-b2**t)               # (Compute bias-corrected second raw moment estimate)
        theta = theta_previous - (alpha * m_hat) / (T.sqrt(v_hat) + e) #(Update parameters)

        updates.append((m_previous, m))
        updates.append((v_previous, v))
        updates.append((theta_previous, theta) )
    updates.append((t, t + 1.))
    return updates

My question is this:

How would you modify the GRUTheano module to use the Adam method above in place of the builtin sgd/rmsprop function?

It looks like the key changes would be to lines 99-126 of GRUTheano:

    # Excerpt of the RMSprop training step that the question wants to replace.
    # NOTE(review): E, U, W, V, b, c, the gradients dE, dU, dW, dV, db, dc and
    # the inputs x, y are defined outside this excerpt — presumably the shared
    # parameters, the gradients of the cost with respect to them, and the
    # input/target sequences; confirm against the full module.

    # SGD parameters: symbolic scalars supplied on every call of the step
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    # rmsprop cache updates: each cache is a running average of the squared
    # gradient, weighted by `decay` (old cache) and `1 - decay` (new gradient)
    mE = decay * self.mE + (1 - decay) * dE ** 2
    mU = decay * self.mU + (1 - decay) * dU ** 2
    mW = decay * self.mW + (1 - decay) * dW ** 2
    mV = decay * self.mV + (1 - decay) * dV ** 2
    mb = decay * self.mb + (1 - decay) * db ** 2
    mc = decay * self.mc + (1 - decay) * dc ** 2

    # Compiled training step: takes x, y, learning_rate and an optional decay
    # (default 0.9), returns no outputs and works purely through `updates`.
    # Each parameter moves against its gradient, scaled by the square root of
    # its new cache (1e-6 keeps the denominator away from zero); afterwards the
    # stored caches self.m* are replaced by the new values.
    self.sgd_step = theano.function(
        [x, y, learning_rate, theano.Param(decay, default=0.9)],
        [], 
        updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
                 (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                 (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                 (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                 (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                 (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                 (self.mE, mE),
                 (self.mU, mU),
                 (self.mW, mW),
                 (self.mV, mV),
                 (self.mb, mb),
                 (self.mc, mc)
                ])

score 0 · Answer 1 · answered Jan 06 '16 at 23:11

I haven't tested this code, but the only change you need is to build the updates with adam(...) instead of the hand-written RMSprop updates. Something like the following should work (this is the complete code, with the RMSprop parts removed):

import numpy as np
import theano as theano
import theano.tensor as T
from theano.gradient import grad_clip
import time
import operator

class GRUTheano(object):
    """Two-layer GRU language model whose training step uses ADAM updates.

    The model holds six shared parameters:
        E: word embeddings, shape (hidden_dim, word_dim)
        U: input weights of the six gates, shape (6, hidden_dim, hidden_dim)
        W: recurrent weights of the six gates, shape (6, hidden_dim, hidden_dim)
        V: output projection, shape (word_dim, hidden_dim)
        b: gate biases, shape (6, hidden_dim)
        c: output bias, shape (word_dim,)

    Constructing an instance compiles the Theano functions ``predict``,
    ``predict_class``, ``ce_error``, ``bptt`` and ``sgd_step``.
    """

    def __init__(self, word_dim, hidden_dim=128, bptt_truncate=-1):
        """Initialise the parameters and compile the Theano functions.

        Parameters:
            word_dim: vocabulary size (length of the output distribution).
            hidden_dim: size of each GRU hidden state.
            bptt_truncate: passed to ``theano.scan`` as ``truncate_gradient``.
        """
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Initialize the network parameters
        E = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        U = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (6, hidden_dim, hidden_dim))
        W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (6, hidden_dim, hidden_dim))
        V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        b = np.zeros((6, hidden_dim))
        c = np.zeros(word_dim)
        # Theano: Created shared variables
        self.E = theano.shared(name='E', value=E.astype(theano.config.floatX))
        self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
        self.W = theano.shared(name='W', value=W.astype(theano.config.floatX))
        self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
        self.b = theano.shared(name='b', value=b.astype(theano.config.floatX))
        self.c = theano.shared(name='c', value=c.astype(theano.config.floatX))
        # We store the Theano graph here
        self.theano = {}
        self.__theano_build__()

    def __theano_build__(self):
        """Build the symbolic graph and compile every Theano function.

        Everything from the scan down to ``sgd_step`` has to live inside this
        method: it depends on ``self``, on the symbolic inputs ``x``/``y`` and
        on the local ``forward_prop_step``, none of which exist at class level.
        """
        E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c

        x = T.ivector('x')
        y = T.ivector('y')

        def forward_prop_step(x_t, s_t1_prev, s_t2_prev):
            # This is how we calculated the hidden state in a simple RNN. No longer!
            # s_t = T.tanh(U[:,x_t] + W.dot(s_t1_prev))

            # Word embedding layer
            x_e = E[:,x_t]

            # GRU Layer 1
            z_t1 = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t1_prev) + b[0])
            r_t1 = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t1_prev) + b[1])
            c_t1 = T.tanh(U[2].dot(x_e) + W[2].dot(s_t1_prev * r_t1) + b[2])
            s_t1 = (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev

            # GRU Layer 2
            z_t2 = T.nnet.hard_sigmoid(U[3].dot(s_t1) + W[3].dot(s_t2_prev) + b[3])
            r_t2 = T.nnet.hard_sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) + b[4])
            c_t2 = T.tanh(U[5].dot(s_t1) + W[5].dot(s_t2_prev * r_t2) + b[5])
            s_t2 = (T.ones_like(z_t2) - z_t2) * c_t2 + z_t2 * s_t2_prev

            # Final output calculation
            # Theano's softmax returns a matrix with one row, we only need the row
            o_t = T.nnet.softmax(V.dot(s_t2) + c)[0]

            return [o_t, s_t1, s_t2]

        # Run the step over the whole input sequence; both hidden states start
        # at zero. The updates returned by scan are not used further.
        [o, s, s2], scan_updates = theano.scan(
            forward_prop_step,
            sequences=x,
            truncate_gradient=self.bptt_truncate,
            outputs_info=[None,
                          dict(initial=T.zeros(self.hidden_dim)),
                          dict(initial=T.zeros(self.hidden_dim))])

        prediction = T.argmax(o, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(o, y))

        # Total cost (could add regularization here)
        cost = o_error

        # Gradients
        dE = T.grad(cost, E)
        dU = T.grad(cost, U)
        dW = T.grad(cost, W)
        db = T.grad(cost, b)
        dV = T.grad(cost, V)
        dc = T.grad(cost, c)

        # Assign functions
        self.predict = theano.function([x], o)
        self.predict_class = theano.function([x], prediction)
        self.ce_error = theano.function([x, y], cost)
        self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc])

        self.params = [self.E, self.U, self.W, self.V, self.b, self.c]

        # `adam` is a module-level function; it only has to exist by the time
        # an instance is created, not when the class body is executed.
        # NOTE: sgd_step takes only (x, y) here; the learning rate is the
        # default of `adam`, not an argument of the compiled function.
        adam_updates = adam(cost, self.params)
        self.sgd_step = theano.function(
            inputs=[x, y],
            outputs=[],
            updates=adam_updates
        )

    def calculate_total_loss(self, X, Y):
        """Return the sum of ``ce_error(x, y)`` over all pairs of X and Y."""
        return np.sum([self.ce_error(x,y) for x,y in zip(X,Y)])

    def calculate_loss(self, X, Y):
        """Return the total loss divided by the number of target words in Y."""
        # Divide calculate_loss by the number of words
        num_words = np.sum([len(y) for y in Y])
        return self.calculate_total_loss(X,Y)/float(num_words)


def adam(loss, all_params, learning_rate=0.001, b1=0.9, b2=0.999, e=1e-8,
         gamma=1-1e-8):
    """
    Construct the ADAM update list for the given shared parameters.

    Defaults follow [Kingma2014].

    Parameters:
        loss: symbolic scalar expression differentiated with respect to each
            entry of ``all_params``.
        all_params: list of shared variables; ``get_value()`` is used to read
            each one's shape.
        learning_rate: step size.
        b1, b2: decay rates of the first and second moment running averages.
        e: constant added to the denominator of the parameter step.
        gamma: per-step decay applied to ``b1``.

    Returns:
        List of ``(shared_variable, new_value_expression)`` pairs: first
        moment, second moment and parameter for every entry of
        ``all_params``, then the step-counter increment.

    References:
    [Kingma2014] Kingma, Diederik, and Jimmy Ba.
    "Adam: A Method for Stochastic Optimization."
    arXiv preprint arXiv:1412.6980 (2014).
    http://arxiv.org/pdf/1412.6980v4.pdf

    """
    gradients = theano.grad(loss, all_params)
    # Step counter, starting at 1; incremented by the final update pair.
    step = theano.shared(np.float32(1))
    # Decayed coefficient of the first moment running average.
    beta1_t = b1 * gamma ** (step - 1)

    rules = []
    for param, grad in zip(all_params, gradients):
        # Moment accumulators start at zero with the parameter's shape.
        shape = param.get_value().shape
        first_moment = theano.shared(np.zeros(shape, dtype=theano.config.floatX))
        second_moment = theano.shared(np.zeros(shape, dtype=theano.config.floatX))

        # Biased moment estimates.
        new_first = beta1_t * first_moment + (1 - beta1_t) * grad
        new_second = b2 * second_moment + (1 - b2) * grad ** 2
        # Bias-corrected moment estimates.
        corrected_first = new_first / (1 - b1 ** step)
        corrected_second = new_second / (1 - b2 ** step)
        # Parameter step.
        new_param = param - (learning_rate * corrected_first) / (T.sqrt(corrected_second) + e)

        rules.extend([
            (first_moment, new_first),
            (second_moment, new_second),
            (param, new_param),
        ])

    rules.append((step, step + 1.))
    return rules

theano GRU rnn adam optimizer

1 Answers1