I am working with Chainer 1.22.0 to implement an LSTM language model. My code works with the CPU but does NOT with the GPU... More precisely, my objective function converges quickly on the CPU but not on the GPU. Do you have any idea what's going on?
Thank you in advance for your help!
Outputs:
$ python debug.py --gpu -1
cpu mode
objective in epoch 0 : 14.8049154282
objective in epoch 1 : 11.7126655579
objective in epoch 2 : 10.6166152954
objective in epoch 3 : 9.81489753723
objective in epoch 4 : 8.90626144409
objective in epoch 5 : 7.73007297516
objective in epoch 6 : 6.31889343262
objective in epoch 7 : 4.83179998398
objective in epoch 8 : 3.52315592766
objective in epoch 9 : 2.58598852158
$ python debug.py --gpu 0
gpu mode 0
objective in epoch 0 : 14.8049144745
objective in epoch 1 : 14.3081817627
objective in epoch 2 : 14.0404243469
objective in epoch 3 : 13.8618173599
objective in epoch 4 : 13.7236022949
objective in epoch 5 : 13.6082553864
objective in epoch 6 : 13.5111179352
objective in epoch 7 : 13.4323377609
objective in epoch 8 : 13.3735141754
objective in epoch 9 : 13.3361949921
Environment:
- Python 2.7.13 (Anaconda)
- Chainer 1.22.0
- CUDA 8.0
My model:
min -sum_t log P(s_t | s_{<t})
where P(s_t | s_{<t}) = LSTM(s_{<t})
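To make the notation concrete, here is a tiny NumPy-only sketch of this objective for a single sequence (the names and toy numbers are illustrative only, not part of the script below):

import numpy as np

def log_softmax(z):
    # numerically stable log-softmax over the last axis
    z = z - z.max(axis=-1, keepdims=True)
    return z - np.log(np.exp(z).sum(axis=-1, keepdims=True))

step_scores = np.random.normal(size=(3, 5))  # model outputs for 3 steps, vocabulary of 5
next_words = np.array([2, 3, 4])             # the observed s_t at each step
objective = -log_softmax(step_scores)[np.arange(3), next_words].sum()  # -sum_t log P(s_t | s_{<t})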
Complete code:
'''
this is the code for this question
'''
import numpy as np
try:
    import cupy as xp
except ImportError:
    xp = np  # fall back to numpy when cupy is not installed
import sys
import chainer as ch
import chainer.links as L
import chainer.functions as F

INT = "int32"
FLOAT = "float32"
BOOLEAN = "bool"
class LSTM(ch.Chain):
    def __init__(self, voc_size, in_size, out_size, batch_size):
        np.random.seed(0)
        w1 = np.random.normal(size=[voc_size, in_size])
        super(LSTM, self).__init__(
            emb=L.EmbedID(voc_size, in_size, initialW=w1),   # word embedding
            enc=L.LSTM(in_size=in_size, out_size=out_size),  # LSTM cell
            scores=L.Linear(out_size, voc_size)              # output transformation
        )
        self.batch_size = batch_size
        self.out_size = out_size
        self.gpu_idx = -1

    # put the links on the GPU
    def to_gpu(self, device_idx):
        self.gpu_idx = device_idx
        self.emb.to_gpu(device_idx)
        self.scores.to_gpu(device_idx)
        self.enc.to_gpu(device_idx)

    def obj(self, seq):
        # the objective is the negative log-likelihood of each word in seq
        return -F.sum(self.logL(seq))

    def logL(self, seq):
        '''
        seq : batch of source sequences of length T : List<List<int>>
        RETURN : R^{batch_size x T} : computation-graph node
        '''
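        # T below is the array module (cupy in GPU mode, numpy otherwise),
        # not the sequence length from the docstring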
        T = xp if self.gpu_idx >= 0 else np
        padded = T.transpose(T.array(seq, dtype=INT))  # Z^{T x batch_size}
        # reset the LSTM cell
        self.enc.reset_state()
        logL = []
        # logL for each time step except the first input
        for i in range(0, len(padded) - 1):
            # get the LSTM output
            h = self.enc(self.emb(padded[i]))  # R^{batch_size x hidden_size}
            # probability distribution over the vocabulary
            s = self.scores(F.tanh(h))  # R^{batch_size x voc_size}
            s = F.transpose(F.log_softmax(s))  # R^{voc_size x batch_size}
            # log-likelihood of the next word: embed_id gathers the rows of s
            # indexed by the next-word ids, and the identity mask keeps only
            # the diagonal, i.e. one log-probability per batch element
            l = F.embed_id(padded[i + 1], s)  # R^{batch_size x batch_size}
            l = F.sum(l * T.identity(self.batch_size), axis=0)  # R^{batch_size}
            logL += [l]
        return F.transpose(F.stack(logL))
GPU_TAG = "--gpu"

if __name__ == "__main__":
    args = sys.argv
    gpu_idx = -1
    i = 0
    # parse the --gpu argument
    while i < len(args):
        if args[i] == GPU_TAG:
            i += 1
            gpu_idx = int(args[i])
        i += 1
    # hyperparameters
    voc_size = 5
    batch_size = 3
    in_size = 5
    out_size = 2
    # instantiate the model
    model = LSTM(voc_size, in_size, out_size, batch_size)
    # GPU mode or CPU mode
    if gpu_idx >= 0:
        print "gpu mode ", gpu_idx
        ch.cuda.get_device(gpu_idx).use()
        model.to_gpu(gpu_idx)
    else:
        print "cpu mode"
    # prepare the optimizer
    trainer = ch.optimizers.sgd.SGD(lr=0.3)
    trainer.setup(model)
    # sequences to train on
    x = [[1, 2, 3, 4]] * batch_size
    # main training loop
    for epoch in range(10):
        obj = model.obj(x)  # forward pass
        model.cleargrads()  # clear gradients before the backward pass
        obj.backward()      # backward pass
        print "objective in epoch ", epoch, ": ", obj.data
        trainer.update()    # parameter update
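For reference, here is the kind of minimal check I would use to narrow this down. It is only a sketch: it assumes the script above is saved as debug.py (as in the commands at the top) and that cupy and a GPU are available. Since __init__ seeds NumPy, both models start from identical weights, so any large difference after the first backward pass points at a CPU/GPU mismatch:

import numpy as np
import chainer as ch
from debug import LSTM  # the model defined above

x = [[1, 2, 3, 4]] * 3

# one forward/backward pass on the CPU
cpu_model = LSTM(5, 5, 2, 3)
cpu_model.cleargrads()
cpu_model.obj(x).backward()
g_cpu = cpu_model.emb.W.grad

# the same pass on the GPU
gpu_model = LSTM(5, 5, 2, 3)
ch.cuda.get_device(0).use()
gpu_model.to_gpu(0)
gpu_model.cleargrads()
gpu_model.obj(x).backward()
g_gpu = ch.cuda.to_cpu(gpu_model.emb.W.grad)

print(np.abs(g_cpu - g_gpu).max())  # should be ~0 if the two passes agree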