I am working with Chainer 1.22.0 to implement an LSTM language model. My code works with the CPU but does NOT with the GPU... More precisely, my objective function converges quickly on the CPU but not on the GPU. Do you have any idea what's going on?
Thank you in advance for your help!
Outputs:
$ python debug.py --gpu -1
cpu mode
objective in epoch 0 : 14.8049154282
objective in epoch 1 : 11.7126655579
objective in epoch 2 : 10.6166152954
objective in epoch 3 : 9.81489753723
objective in epoch 4 : 8.90626144409
objective in epoch 5 : 7.73007297516
objective in epoch 6 : 6.31889343262
objective in epoch 7 : 4.83179998398
objective in epoch 8 : 3.52315592766
objective in epoch 9 : 2.58598852158
$ python debug.py --gpu 0
gpu mode 0
objective in epoch 0 : 14.8049144745
objective in epoch 1 : 14.3081817627
objective in epoch 2 : 14.0404243469
objective in epoch 3 : 13.8618173599
objective in epoch 4 : 13.7236022949
objective in epoch 5 : 13.6082553864
objective in epoch 6 : 13.5111179352
objective in epoch 7 : 13.4323377609
objective in epoch 8 : 13.3735141754
objective in epoch 9 : 13.3361949921
Environment:
- Python 2.7.13 (Anaconda)
- Chainer 1.22.0
- CUDA 8.0
My model:
min -sum_t log P(s_t | s_{<t})
where P(s_t | s_{<t}) = LSTM(s_{<t})
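To make the notation concrete, here is a tiny NumPy-only sketch of this objective for a single sequence (the names and toy numbers are illustrative only, not part of the script below):

import numpy as np

def log_softmax(z):
    # numerically stable log-softmax over the last axis
    z = z - z.max(axis=-1, keepdims=True)
    return z - np.log(np.exp(z).sum(axis=-1, keepdims=True))

step_scores = np.random.normal(size=(3, 5))  # model outputs for 3 steps, vocabulary of 5
next_words = np.array([2, 3, 4])             # the observed s_t at each step
objective = -log_softmax(step_scores)[np.arange(3), next_words].sum()  # -sum_t log P(s_t | s_{<t})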
Complete code:
'''
this is the code for this question
'''
import numpy as np
try:
    import cupy as xp
except ImportError:
    xp = np  # fall back to numpy when cupy is not installed
import sys
import chainer as ch
import chainer.links as L
import chainer.functions as F

INT = "int32"
FLOAT = "float32"
BOOLEAN = "bool"
class LSTM(ch.Chain):
    def __init__(self, voc_size, in_size, out_size, batch_size):
        np.random.seed(0)
        w1 = np.random.normal(size=[voc_size, in_size])
        super(LSTM, self).__init__(
            emb=L.EmbedID(voc_size, in_size, initialW=w1),   # word embedding
            enc=L.LSTM(in_size=in_size, out_size=out_size),  # LSTM cell
            scores=L.Linear(out_size, voc_size)              # output transformation
        )
        self.batch_size = batch_size
        self.out_size = out_size
        self.gpu_idx = -1

    # put the links on the GPU
    def to_gpu(self, device_idx):
        self.gpu_idx = device_idx
        self.emb.to_gpu(device_idx)
        self.scores.to_gpu(device_idx)
        self.enc.to_gpu(device_idx)

    def obj(self, seq):
        # the objective is the negative log-likelihood of each word in seq
        return -F.sum(self.logL(seq))

    def logL(self, seq):
        '''
        seq : batch of source sequences of length T : List<List<int>>
        RETURN : R^{batch_size x T} : computation-graph node
        '''
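        # T below is the array module (cupy in GPU mode, numpy otherwise),
        # not the sequence length from the docstring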
        T = xp if self.gpu_idx >= 0 else np
        padded = T.transpose(T.array(seq, dtype=INT))  # Z^{T x batch_size}
        # reset the LSTM cell
        self.enc.reset_state()
        logL = []
        # logL for each time step except the first input
        for i in range(0, len(padded) - 1):
            # get the LSTM output
            h = self.enc(self.emb(padded[i]))  # R^{batch_size x hidden_size}
            # probability distribution over the vocabulary
            s = self.scores(F.tanh(h))  # R^{batch_size x voc_size}
            s = F.transpose(F.log_softmax(s))  # R^{voc_size x batch_size}
            # log-likelihood of the next word: embed_id gathers the rows of s
            # indexed by the next-word ids, and the identity mask keeps only
            # the diagonal, i.e. one log-probability per batch element
            l = F.embed_id(padded[i + 1], s)  # R^{batch_size x batch_size}
            l = F.sum(l * T.identity(self.batch_size), axis=0)  # R^{batch_size}
            logL += [l]
        return F.transpose(F.stack(logL))
GPU_TAG = "--gpu"

if __name__ == "__main__":
    args = sys.argv
    gpu_idx = -1
    i = 0
    # parse the --gpu argument
    while i < len(args):
        if args[i] == GPU_TAG:
            i += 1
            gpu_idx = int(args[i])
        i += 1
    # hyperparameters
    voc_size = 5
    batch_size = 3
    in_size = 5
    out_size = 2
    # instantiate the model
    model = LSTM(voc_size, in_size, out_size, batch_size)
    # GPU mode or CPU mode
    if gpu_idx >= 0:
        print "gpu mode ", gpu_idx
        ch.cuda.get_device(gpu_idx).use()
        model.to_gpu(gpu_idx)
    else:
        print "cpu mode"
    # prepare the optimizer
    trainer = ch.optimizers.sgd.SGD(lr=0.3)
    trainer.setup(model)
    # sequences to train on
    x = [[1, 2, 3, 4]] * batch_size
    # main training loop
    for epoch in range(10):
        obj = model.obj(x)  # forward pass
        model.cleargrads()  # clear gradients before the backward pass
        obj.backward()      # backward pass
        print "objective in epoch ", epoch, ": ", obj.data
        trainer.update()    # parameter update
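For reference, here is the kind of minimal check I would use to narrow this down. It is only a sketch: it assumes the script above is saved as debug.py (as in the commands at the top) and that cupy and a GPU are available. Since __init__ seeds NumPy, both models start from identical weights, so any large difference after the first backward pass points at a CPU/GPU mismatch:

import numpy as np
import chainer as ch
from debug import LSTM  # the model defined above

x = [[1, 2, 3, 4]] * 3

# one forward/backward pass on the CPU
cpu_model = LSTM(5, 5, 2, 3)
cpu_model.cleargrads()
cpu_model.obj(x).backward()
g_cpu = cpu_model.emb.W.grad

# the same pass on the GPU
gpu_model = LSTM(5, 5, 2, 3)
ch.cuda.get_device(0).use()
gpu_model.to_gpu(0)
gpu_model.cleargrads()
gpu_model.obj(x).backward()
g_gpu = ch.cuda.to_cpu(gpu_model.emb.W.grad)

print(np.abs(g_cpu - g_gpu).max())  # should be ~0 if the two passes agree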