I have the following trimmed down model:
import torch.nn as nn
import torch
import argparse
import torch
import torch.utils.data
from torch import nn, optim
from torch.autograd import Variable
from torch.nn import functional as F
from torchvision import datasets, transforms
from torchvision.utils import save_image
import json
import numpy as np
import datetime
import os
class EncoderRNN(nn.Module):
def __init__(self, input_size=8, hidden_size=10, num_layers=2):
super(EncoderRNN, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.relu = nn.ReLU()
self.sigmoid = nn.Sigmoid()
#initialize weights
nn.init.xavier_uniform(self.lstm.weight_ih_l0, gain=np.sqrt(2))
nn.init.xavier_uniform(self.lstm.weight_hh_l0, gain=np.sqrt(2))
def forward(self, input):
tt = torch
print input.shape
h0 = Variable(tt.FloatTensor(self.num_layers, input.size(0), self.hidden_size))
c0 = Variable(tt.FloatTensor(self.num_layers, input.size(0), self.hidden_size))
encoded_input, hidden = self.lstm(input, (h0, c0))
encoded_input = self.sigmoid(encoded_input)
return encoded_input
train_x = torch.from_numpy(np.random.random((2000,19,8))).float()
train_loader = torch.utils.data.DataLoader(train_x,
batch_size=64, shuffle=True)
model = EncoderRNN()
optimizer = optim.Adam(model.parameters(), lr=1e-6)
optimizer.zero_grad()
loss_function = torch.nn.BCELoss(reduce=True)
def train(epoch):
model.train()
train_loss = 0
for batch_idx, (data_x) in enumerate(train_loader):
x = model(Variable(data_x))
print("output has nan: " + str(np.isnan(x.detach().numpy()).any()))
train(0)
To summarize, I think I'm basically just feeding an input into an LSTM with random initialized hidden values, and then taking the sigmoid of the output of that LSTM. Then I'm feeding that output to a decoder LSTM, and taking the sigmoid of the output of the decoder output and using that as my final value.
Unfortunately, even in the first iteration, the model often outputs a vector of the correct shape (batch_size, seq_length, seq_dim), but contains at least one and sometimes all NaN values. What am I doing wrong?
Thanks!
What I've tried so far:
- changing LSTM to GRU
- changing sigmoid to relu
- changing the dimension of the hidden representation
- getting the failing input down to the encoder
Edit: Apologies to everyone who tried to help when I had broken code - I really do value your time, and thanks so much for helping!