I am trying to implement the paper Deep Learning for Answer Sentence Selection, more precisely its Bigram model, using Theano's scan function like this:
import theano
import theano.tensor as T
from theano.ifelse import ifelse

Tl = theano.shared(...)
Tr = theano.shared(...)
b = theano.shared(...)
s = T.matrix('s')
results, updates = theano.scan(
    lambda t_0, t_p1, prior_result, Tl, Tr, b:
        prior_result + T.tanh(T.dot(Tl, t_0) + T.dot(Tr, t_p1) + b),
    sequences=dict(input=s, taps=[0, 1]),
    outputs_info=T.zeros_like(b, dtype='float64'),
    non_sequences=[Tl, Tr, b],
    strict=True
)
final_result = ifelse(T.eq(s.shape[0], 1), s[0], results[-1])
My problem is that some answers are only one word long, so s has only a single row. This causes a problem with the scan function, since the +1 tap of the sequence is missing. To handle this case I want to use the ifelse statement.
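To illustrate, here is a stripped-down sketch (the variable names are my own) of what I think goes wrong: with taps [0, 1] a sequence of length n yields only n - 1 scan steps, so a one-row input produces an empty output and indexing it with [-1] fails at runtime.

import numpy as np
import theano
import theano.tensor as T

s = T.matrix('s')
# with taps [0, 1] the step function sees s[i] and s[i + 1],
# so a sequence of length n produces only n - 1 steps
results, _ = theano.scan(
    lambda t_0, t_p1: t_0 + t_p1,
    sequences=dict(input=s, taps=[0, 1])
)
f = theano.function([s], results[-1])

f(np.ones((2, 3), dtype=theano.config.floatX))  # one step: works
f(np.ones((1, 3), dtype=theano.config.floatX))  # zero steps: index error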
Now my first question: is this even possible? Or will the scan always be evaluated, with the ifelse only deciding afterwards which value to use?
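For reference, this is the small check I plan to use (my own sketch, same idea as the Print ops in the longer code below): wrap both branches in Print ops; if both branches print on a forward pass, ifelse was not taken lazily.

import theano
import theano.tensor as T
from theano.ifelse import ifelse

x = T.scalar('x')
then_branch = theano.printing.Print('then')(x + 1)
else_branch = theano.printing.Print('else')(x - 1)
f = theano.function([x], ifelse(T.gt(x, 0), then_branch, else_branch))
f(3.0)  # if ifelse is lazy here, only 'then' should print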
Second question: how can I make tensor.eq(a, b) return a scalar? I ask because I get the following error message:
TypeError: Condition given to the op has to be a scalar with 0 standing for False, anything else for True
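As far as I understand (this is my assumption, not something I have verified in the docs), T.eq on two scalars already returns a 0-d tensor, which ifelse accepts as a condition; if the comparison somehow ends up with extra dimensions, reducing it to 0-d seems to avoid the TypeError:

import theano
import theano.tensor as T
from theano.ifelse import ifelse

s = T.matrix('s')
cond = T.eq(s.shape[0], 1)      # 0-d tensor, accepted as a condition
# cond = T.eq(...).sum()        # reduce to 0-d if extra dims remain
out = ifelse(cond, s[0], s.sum(axis=0))
f = theano.function([s], out)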
EDIT: here is some code that triggers an IndexError, because ifelse is not evaluated lazily when Theano computes the gradients:
import numpy as np
import theano
import theano.tensor as T
from theano.ifelse import ifelse
def trainBigram(q, a, y, seed=8024, lRate=0.1, maxEpochs=1, modelSize=3):
    # q: list (all questions) of lists (words per question) of lists (values of the word embedding)
    # a: list (all questions) of lists (possible answers) of lists (words per answer) of lists (embedding values)
    # y: list (all questions) of lists (correctness of each answer)
    # trains on triplets (q(i), a(i,j), y(i,j)) instead of softmaxing all 4 answer possibilities
    # uses SGD
    np.random.seed(seed)

    # theano variables
    q_t = T.matrix('q_t')
    a_t = T.matrix('a_t')
    y_t = T.scalar('y_t')

    # initialize weights (distribution from the paper)
    M = theano.shared(np.random.normal(0, 0.01, (modelSize, modelSize)))
    bm = theano.shared(np.random.normal(0, 0.01, 1))
    Tl = theano.shared(np.random.normal(0, 0.01, (modelSize, modelSize)))
    Tr = theano.shared(np.random.normal(0, 0.01, (modelSize, modelSize)))
    bt = theano.shared(np.random.normal(0, 0.01, modelSize))
    # define graph for the Bigram model
    q_em_scan, scanUpdatesQ = theano.scan(
        lambda t_0, t_p1, prior_result, Tl, Tr, bt:
            prior_result + T.tanh(T.dot(Tl, t_0) + T.dot(Tr, t_p1) + bt),
        sequences=dict(input=q_t, taps=[0, 1]),
        outputs_info=T.zeros_like(bt, dtype='float64'),
        non_sequences=[Tl, Tr, bt],
        strict=True
    )
    q_em = q_em_scan[-1]

    a_em_scan, scanUpdatesA = theano.scan(
        lambda t_0, t_p1, prior_result, Tl, Tr, bt:
            prior_result + T.tanh(T.dot(Tl, t_0) + T.dot(Tr, t_p1) + bt),
        sequences=dict(input=a_t, taps=[0, 1]),
        outputs_info=T.zeros_like(bt, dtype='float64'),
        non_sequences=[Tl, Tr, bt],
        strict=True
    )
    # print the calculated values to check whether ifelse is evaluated lazily
    MultipleWords = theano.printing.Print('multiple Words answer')(a_em_scan[-1])
    OneWord = theano.printing.Print('1 Word answer')(a_t[0])
    a_em = ifelse(T.eq(a_t.shape[0], 1), OneWord, MultipleWords)
    # define graph for the Question/Answer model
    prob = 1 / (1 + T.exp(-T.dot(T.dot(M, a_em), q_em) + bm))
    xent = - y_t * T.log(prob) - (1 - y_t) * T.log(1 - prob)
    loss = xent.sum()
    g_M, g_bm, g_Tl, g_Tr, g_bt = T.grad(loss, [M, bm, Tl, Tr, bt])
    updates = (
        (M, M - lRate * g_M),
        (bm, bm - lRate * g_bm),
        (Tl, Tl - lRate * g_Tl),
        (Tr, Tr - lRate * g_Tr),
        (bt, bt - lRate * g_bt)
    )

    # compile function
    train = theano.function(
        inputs=[q_t, a_t, y_t],
        outputs=prob,
        updates=updates
    )
    # training
    for question, answers, labels in zip(q, a, y):
        # triplets instead of softmax
        for answer, label in zip(answers, labels):
            answer = np.asarray(answer)
            if answer.shape[0] == 1:
                print("!!! One-Word-Answer !!!")
                print("shape: {}".format(answer.shape))
            prob = train(question, answer, label)
            print(prob)
def main():
    questionOne = [[1, 2, 3], [1, 2, 3]]
    answerOne_One = [[1, 2, 3], [1, 2, 3]]
    answerOne_Two = [[1, 2, 3], [1, 2, 3]]
    answersOne = [answerOne_One, answerOne_Two]
    correctnessOne = [0, 1]

    questionTwo = [[4, 5, 6], [4, 5, 6]]
    answerTwo_One = [[4, 5, 6]]
    answerTwo_Two = [[4, 5, 6]]
    answersTwo = [answerTwo_One, answerTwo_Two]
    correctnessTwo = [1, 0]

    q = [questionOne, questionTwo]
    a = [answersOne, answersTwo]
    y = [correctnessOne, correctnessTwo]
    trainBigram(q, a, y)

main()
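For completeness, the workaround I am currently considering is to pad one-word answers with a zero embedding on the Python side, so the scan always gets at least one (t, t+1) step. Whether a zero row is harmless for the model is my own assumption, since tanh(Tl·w + Tr·0 + bt) is not zero.

import numpy as np

def pad_answer(answer, modelSize=3):
    # hypothetical helper, not from the paper: append a zero embedding
    # row to one-word answers so the bigram scan has at least one step
    answer = np.asarray(answer, dtype='float64')
    if answer.shape[0] == 1:
        answer = np.vstack([answer, np.zeros((1, modelSize))])
    return answer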