Theano , recurrent neural network, error is nan

Question

I am trying to duplicate the recent work on unitary evolution neural networks. Adapting from the code published by the author, I have written the following code

import matplotlib.pyplot as plt
import numpy as np
import theano
import theano.tensor as T
import sys

#theano.config.exception_verbosity='high'
class RNN(object):
    def __init__(self,n_in,n_out,n_hid,learning_rate = 0.000001):
        self.dtype = theano.config.floatX
        self.learning_rate = learning_rate
        self.n_in = n_in
        self.n_hid = n_hid
        self.n_out = n_out
        self.generate_parameters()
        self.params = [self.V_re, self.V_im, self.U, self.hidden_bias, self.reflection, self.out_bias, self.theta, self.scale]
        inp = T.matrix(dtype = self.dtype)
        target = T.matrix(dtype = self.dtype)
        h_0_batch = self.h_0
        non_sequences = [self.theta, self.V_re, self.V_im, self.hidden_bias, self.scale, self.out_bias, self.U]
        sequences = inp
        self.index_permute = np.random.permutation(self.n_hid)
        h_t,_ = theano.scan(fn=self.one_step,sequences=sequences,non_sequences=non_sequences,outputs_info=h_0_batch)


        y_t = T.dot(h_t[self.h_t.shape[0]-1], self.U) + self.out_bias
        self.p_y_given_x = T.nnet.softmax(y_t)
        #cost = T.nnet.categorical_crossentropy(y, y_t).mean()
        self.y_t = T.argmax(self.p_y_given_x, axis = 1)
        self.lr = theano.shared(np.cast[self.dtype](self.learning_rate))
        self.cost = -T.sum(target*T.log(self.p_y_given_x) + (1.- target)*T.log(1. - self.p_y_given_x))
        self.learn_rnn_fn = self.get_train_graph(target, inp, self.cost)
        self.pred_rnn_fn = self.get_pred_graph(inp)
        print "Built model"



    def do_fft(self,input, n_hidden):
        fft_input = T.reshape(input, (input.shape[0], 2, n_hidden))
        fft_input = fft_input.dimshuffle(0,2,1)
        fft_output = cufft(fft_input) / T.sqrt(n_hidden)
        fft_output = fft_output.dimshuffle(0,2,1)
        output = T.reshape(fft_output, (input.shape[0], 2*n_hidden))
        return output

    def do_ifft(self,input, n_hidden):
        ifft_input = T.reshape(input, (input.shape[0], 2, n_hidden))
        ifft_input = ifft_input.dimshuffle(0,2,1)
        ifft_output = cuifft(ifft_input) / T.sqrt(n_hidden)
        ifft_output = ifft_output.dimshuffle(0,2,1)
        output = T.reshape(ifft_output, (input.shape[0], 2*n_hidden))
        return output


    def scale_diag(self,input, n_hidden, diag):
        input_re = input[:, :n_hidden]
        input_im = input[:, n_hidden:]
        Diag = T.nlinalg.AllocDiag()(diag)
        input_re_times_Diag = T.dot(input_re, Diag)
        input_im_times_Diag = T.dot(input_im, Diag)
        return T.concatenate([input_re_times_Diag, input_im_times_Diag], axis=1)

    def times_diag(self,input, n_hidden, diag):
        input_re = input[:, :n_hidden]
        input_im = input[:, n_hidden:]
        Re = T.nlinalg.AllocDiag()(T.cos(diag))
        Im = T.nlinalg.AllocDiag()(T.sin(diag))
        input_re_times_Re = T.dot(input_re, Re)
        input_re_times_Im = T.dot(input_re, Im)
        input_im_times_Re = T.dot(input_im, Re)
        input_im_times_Im = T.dot(input_im, Im)

        return T.concatenate([input_re_times_Re - input_im_times_Im,
                          input_re_times_Im + input_im_times_Re], axis=1)

    def vec_permutation(self,input, n_hidden, index_permute):
        re = input[:, :n_hidden]
        im = input[:, n_hidden:]
        re_permute = re[:, index_permute]
        im_permute = im[:, index_permute]

        return T.concatenate([re_permute, im_permute], axis=1)      

    def times_reflection(self,input, n_hidden, reflection):
        input_re = input[:, :n_hidden]
        input_im = input[:, n_hidden:]
        reflect_re = reflection[:n_hidden]
        reflect_im = reflection[n_hidden:]
        vstarv = (reflect_re**2 + reflect_im**2).sum()
        input_re_reflect = input_re - 2 / vstarv * (T.outer(T.dot(input_re, reflect_re), reflect_re) 
                                                    + T.outer(T.dot(input_re, reflect_im), reflect_im) 
                                                    - T.outer(T.dot(input_im, reflect_im), reflect_re) 
                                                    + T.outer(T.dot(input_im, reflect_re), reflect_im))
        input_im_reflect = input_im - 2 / vstarv * (T.outer(T.dot(input_im, reflect_re), reflect_re) 
                                                    + T.outer(T.dot(input_im, reflect_im), reflect_im) 
                                                    + T.outer(T.dot(input_re, reflect_im), reflect_re) 
                                                    - T.outer(T.dot(input_re, reflect_re), reflect_im))

        return T.concatenate([input_re_reflect, input_im_reflect], axis=1)      

    def sample_weights(self,SizeX, SizeY):
        values = np.ndarray([SizeX, SizeY], dtype = self.dtype)
        for dx in range(SizeX):
            row_val = np.random.normal(loc = 0.0, scale = 0.1, size=(SizeY,))
            values[dx,:] = row_val
        _,svs,_ = np.linalg.svd(values)
        values = values / svs[0]
        return values

    def generate_parameters(self):
        np.random.seed(1234)
        rng = np.random.RandomState(1234)
        self.V_re = theano.shared(self.sample_weights(self.n_in, self.n_hid))
        self.V_im = theano.shared(self.sample_weights(self.n_in, self.n_hid))
        self.U = theano.shared(self.sample_weights(2 * self.n_hid, self.n_out))
        self.hidden_bias = theano.shared(np.asarray(rng.uniform(low=-0.01,
                                                       high=0.01,
                                                       size=(self.n_hid,)),
                                               dtype=self.dtype))

        self.reflection = theano.shared(self.sample_weights(2, 2*self.n_hid))
        self.out_bias = theano.shared(np.zeros((self.n_out,), dtype=self.dtype))
        self.theta = theano.shared(self.sample_weights(3, self.n_hid))
        bucket = np.sqrt(2.) * np.sqrt(3. / 2 / self.n_hid)
        self.h_0 = theano.shared(np.asarray(rng.uniform(low=-bucket,
                                               high=bucket,
                                               size=(1, 2 * self.n_hid)), 
                                       dtype=self.dtype),
                        name='h_0')

        self.scale = theano.shared(np.zeros((self.n_hid,), dtype=self.dtype))

    def logistic_function(self,vec):
        return 1/(1 + T.exp(-vec))
    def activ_tan(self,vec):
        return T.tanh(vec)
    def one_step(self,x_t, h_prev, theta, V_re, V_im, hidden_bias, scale, out_bias, U):
                # Compute hidden linear transform
        step1 = self.times_diag(h_prev, self.n_hid, self.theta[0,:])
        step2 = step1
#        step2 = do_fft(step1, n_hidden)
        step3 = self.times_reflection(step2, self.n_hid, self.reflection[0,:])
        step4 = self.vec_permutation(step3, self.n_hid, self.index_permute)
        step5 = self.times_diag(step4, self.n_hid, theta[1,:])
        step6 = step5
#        step6 = do_ifft(step5, n_hidden)
        step7 = self.times_reflection(step6, self.n_hid, self.reflection[1,:])
        step8 = self.times_diag(step7, self.n_hid, self.theta[2,:])     
        step9 = self.scale_diag(step8, self.n_hid, self.scale)

        hidden_lin_output = step9
        # Compute data linear transform
        data_lin_output_re = T.dot(x_t, V_re)
        data_lin_output_im = T.dot(x_t, V_im)
        data_lin_output = T.concatenate([data_lin_output_re, data_lin_output_im], axis=0)

        # Total linear output        
        lin_output = hidden_lin_output + data_lin_output
        lin_output_re = lin_output[:, :self.n_hid]
        lin_output_im = lin_output[:, self.n_hid:] 


        # Apply non-linearity ----------------------------


        # scale RELU nonlinearity
        modulus = T.sqrt(lin_output_re ** 2 + lin_output_im ** 2)
        rescale = T.maximum(modulus + hidden_bias.dimshuffle('x',0), 0.) / (modulus + 1e-5)
        nonlin_output_re = lin_output_re * rescale
        nonlin_output_im = lin_output_im * rescale

        h_t = T.concatenate([nonlin_output_re, 
                             nonlin_output_im], axis=1)
        return h_t

        #cost = -T.sum(T.log(np.abs(p_y_given_x))[0][T.argmax(target)])
    def get_train_graph(self, target, inp, cost):
        grads = []
        for param in self.params:
            grads.append(T.grad(cost, param))
        update = []
        for param,grad in zip(self.params, grads):
            update.append((param, param - grad*self.lr))
        train_fn = theano.function(inputs = [inp,target], outputs = cost, updates = update)
        return train_fn

    def get_pred_graph(self,inp):
        predictions = theano.function(inputs = [inp], outputs = self.y_t)
        return predictions

def convert_string(file):
    f = open(file,'r')
    text = f.read()
    f.close()
    inp = np.zeros([len(text), 256],dtype=theano.config.floatX)
    out = np.zeros([len(text), 256],dtype=theano.config.floatX)
    counter = 0
    for char in text:
        if(counter > 0):
            inp[counter][ord(char)] = 1
            out[counter-1][ord(char)] = 1
        counter = counter + 1
    return [inp, out]

train_data = convert_string("log")
j = 0
model = RNN(256,256,1000)
n_epoch=10
dtype = dtype = theano.config.floatX
def train_rnn(train_data, n_epoch = 100):
    train_err = np.ndarray(n_epoch)
    for i in range(n_epoch):
        for j in range(len(train_data[0])):
            tempInp = np.zeros([1,256],dtype=dtype);
            tempInp[0] = train_data[0][j]
            tempOut = np.zeros([1,256],dtype=dtype);
            tempOut[0] = train_data[1][j]
            train_cost = model.learn_rnn_fn(tempInp, tempOut)
            sys.stdout.write((model.pred_rnn_fn(tempInp)))
            train_err[i]=train_err[i]+ train_cost
            train_err[i]= train_err[i]/len(train_data[0])
            print train_cost
        print "\n"
    return train_err

train_errors = train_rnn(train_data, n_epoch)
print train_errors
def plot_learning_curve(train_err):
    plt.plot(np.arange(n_epoch), train_errors, 'b-')
    plt.xlabel('epochs')
    plt.ylabel('error')
    plt.show()
plot_learning_curve(train_errors)

But after an initial error of 6.23 , I am getting all nans. Could someone explain if there is some error in the code. I posted the whole code so that I do not miss out on the erroneous section(and I dont know which one it is)

Thanks!

score 0 · Answer 1 · answered May 04 '16 at 00:03

While Marcin Możejko was probably right about the source of the NaNs beieng numerical problems with the logs, theano's doc gives good general advice on dealing with NaNs.

In particular, how to use NaN guard - which will throw an error as soon as a tensor variable contains NaNs:

from theano.compile.nanguardmode import NanGuardMode

...
... = theano.function(..., mode=NanGuardMode(nan_is_error=True,
                                             inf_is_error=True,
                                             big_is_error=True)
                     )

Theano , recurrent neural network, error is nan

1 Answers1