When I train a model, the score becomes NaN, like this:
def forward(self, input):
    alias_inputs, A, items, mask, targets = input
    hidden = self.embedding(items)
    seq_output = self.seq_modeling(alias_inputs, A, hidden, mask).unsqueeze(1)  # [batch, 1, dim]
    target_output = self.embedding(targets)  # [batch, 2, dim]
    score = (seq_output * target_output).sum(-1)
    output = score.view(-1, 2)
    batch_loss = -torch.mean(1e-8 + torch.log(torch.sigmoid(torch.matmul(output, self.weight))))
    if torch.isnan(batch_loss):
        from logging import getLogger
        logger = getLogger()
        logger.info(score)
        logger.info(seq_output.dtype)
        logger.info(target_output.dtype)
        logger.info(output)
From the logged output, both seq_output and score become NaN (target_output is not NaN).
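As an aside, I realize the 1e-8 in my loss sits outside the log, so it cannot guard against log(0). A numerically safer version of the same loss would use torch.nn.functional.logsigmoid; here is a minimal standalone sketch (the batch size and the [2, 1] weight shape are stand-ins, since self.weight's real shape is not shown above):

import torch
import torch.nn.functional as F

# Stand-in tensors; shapes are hypothetical (batch=4, weight [2, 1]).
output = torch.randn(4, 2)   # [batch, 2] scores, as in forward()
weight = torch.randn(2, 1)   # stand-in for self.weight

logits = torch.matmul(output, weight)
# F.logsigmoid(x) computes log(sigmoid(x)) in a numerically stable way,
# whereas torch.log(torch.sigmoid(x)) underflows to log(0) = -inf for
# large negative x. Note that in my original loss the 1e-8 is added
# OUTSIDE the log, so it does not protect the log at all.
batch_loss = -torch.mean(F.logsigmoid(logits))

That said, the NaN already appears in seq_output, before the loss is computed, so this alone would not explain it.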
However, while debugging I noticed something strange: if I print the result in advance, I do not get NaN. Likewise, if I step into the self.seq_modeling() function line by line in the debugger, there is no NaN and training goes well; yet if I simply step over that call, seq_output comes back NaN right after self.seq_modeling() returns. I am very confused by this.
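To localize where the NaN first appears without stepping through in the debugger, I could run with anomaly detection and synchronous CUDA launches. A minimal sketch (model and batch are hypothetical stand-ins for my actual training loop):

import os

# Must be set before any CUDA work, so that kernel errors surface at the
# offending line instead of at a later synchronization point.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch

# Anomaly detection records the forward graph and reports the first
# operation whose backward produced NaN/inf.
with torch.autograd.set_detect_anomaly(True):
    loss = model(batch)  # hypothetical: one forward() call as above
    loss.backward()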
For example, when I add logging to the code:
def seq_modeling(self, alias_inputs, A, hidden, mask):
    hidden = self.gnn(A, hidden)
    logger.info(torch.isnan(hidden).any())  # add log here
    get = lambda i: hidden[i][alias_inputs[i]]
    seq_hidden = torch.stack([get(i) for i in torch.arange(len(alias_inputs)).long()])
    ht = seq_hidden[torch.arange(mask.shape[0]).long(), torch.sum(mask, 1) - 1]
    q1 = self.linear_one(ht).view(ht.shape[0], 1, ht.shape[1])  # batch_size x 1 x latent_size
    q2 = self.linear_two(seq_hidden)  # batch_size x seq_length x latent_size
    alpha = self.linear_three(torch.sigmoid(q1 + q2))
    a = torch.sum(alpha * seq_hidden * mask.view(mask.shape[0], -1, 1).float(), 1)
    a = self.linear_transform(torch.cat([a, ht], 1))
    logger.info(torch.isnan(a).any())  # add log here
    return a

def forward(self, input):
    alias_inputs, A, items, mask, targets = input
    hidden = self.embedding(items)
    seq_output = self.seq_modeling(alias_inputs, A, hidden, mask).unsqueeze(1)  # [batch, 1, dim]
    logger.info(seq_output)        # add log here
    logger.info(seq_output.dtype)  # add log here
    target_output = self.embedding(targets)  # [batch, 2, dim]
    score = (seq_output * target_output).sum(-1)
    logger.info(score)  # add log here -> not NaN
    output = score.view(-1, 2)
    batch_loss = -torch.mean(1e-8 + torch.log(torch.sigmoid(torch.matmul(output, self.weight))))
    return batch_loss
With this logging in place, the NaN just disappears and training goes well! I wonder why this happens, because all I added were a few logging statements.
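My only guess so far is that logging a CUDA tensor forces a device synchronization (the values have to be copied to the CPU to be formatted), and that the synchronization itself is what changes the behavior. If so, replacing the logging with an explicit sync should reproduce the "fix"; a minimal sketch of that check:

import torch

def sync_point():
    # logger.info(tensor) implicitly copies the tensor to the CPU, which
    # synchronizes the CUDA stream. Calling synchronize() directly tests
    # whether the sync alone is what makes the NaN disappear.
    if torch.cuda.is_available():
        torch.cuda.synchronize()

If the NaN still disappears when I call sync_point() where the logger.info lines were, that would point at an asynchronous-execution issue rather than at the logging itself.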