I'm trying to train a linear regression model with PyTorch, using the code from here as a starting point. I made several changes to the original code: 1. increase the dataset size n_data from 11 to 1e5, 2. decrease the learning rate to 1e-8, 3. train the model on randomly drawn batches with a fixed batch_size=128.
import numpy as np
import torch
from torch.autograd import Variable


class linearRegression(torch.nn.Module):
    def __init__(self, inputSize, outputSize):
        super(linearRegression, self).__init__()
        self.linear = torch.nn.Linear(inputSize, outputSize)

    def forward(self, x):
        out = self.linear(x)
        return out


def linear_demo():
    # create dummy data for training
    n_data = 100000
    x_values = [i for i in range(n_data)]
    x_train = np.array(x_values, dtype=np.float64)
    x_train = x_train.reshape(-1, 1)
    # y_values = [2*i + 1 + (np.random.rand(1)-0.5)*5 for i in x_values]
    y_values = [2*i + 1 for i in x_values]
    y_train = np.array(y_values, dtype=np.float64)
    y_train = y_train.reshape(-1, 1)

    inputDim = 1   # takes variable 'x'
    outputDim = 1  # takes variable 'y'
    learningRate = 1e-8
    epochs = 10
    batch_size = 128

    model = linearRegression(inputDim, outputDim).to(torch.float64)
    ##### For GPU #######
    if torch.cuda.is_available():
        model.cuda()

    criterion = torch.nn.MSELoss(reduction='mean')
    optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)

    for epoch in range(epochs):
        # draw one random batch of size batch_size per epoch
        permutation = torch.randperm(n_data)
        idx = permutation[:batch_size]

        # Converting inputs and labels to Variable
        if torch.cuda.is_available():
            inputs = Variable(torch.from_numpy(x_train[idx]).cuda())
            labels = Variable(torch.from_numpy(y_train[idx]).cuda())
        else:
            inputs = Variable(torch.from_numpy(x_train[idx]))
            labels = Variable(torch.from_numpy(y_train[idx]))

        # Clear gradient buffers so gradients from the previous step don't accumulate
        optimizer.zero_grad()

        # get output from the model, given the inputs
        outputs = model(inputs)

        # get loss for the predicted output
        loss = criterion(outputs, labels)
        if loss.item() > 100:
            print(loss)

        # get gradients w.r.t. the parameters
        loss.backward()

        # update parameters
        optimizer.step()

        if epoch % 10 == 0:
            print('epoch {}, loss {}'.format(epoch, loss.item()))
    return
However, the above code results in an increasing training loss. [plot: training loss vs. epoch, increasing]
I tried the following: 1. decrease n_data to 1e4, 2. decrease learningRate to 1e-10. Either change alone is enough to make the training loss decrease (the exact settings are listed after this paragraph).
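For clarity, these are the only values that differ between those two runs (everything else is exactly the code above):

# run 1: smaller dataset, original learning rate -> loss decreases
n_data = 10000
learningRate = 1e-8

# run 2: original dataset size, smaller learning rate -> loss decreases
n_data = 100000
learningRate = 1e-10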
However, I don't understand why this dataset size and learning rate together make the training loss increase, since batch_size is fixed and the training loop only ever sees one batch at a time, with no direct knowledge of how large the dataset is. Can anyone explain why the training loss depends on the dataset size, given a fixed batch size, in this linear regression model?
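For reference, here is a small self-contained sketch (not part of my script; lin, x and y are just throwaway names) of how the gradient scale on a single 128-sample batch drawn from the same x range can be inspected:

import torch

# one backward pass on a batch of 128 raw (unnormalized) inputs in [0, 1e5),
# printing the gradient on the weight and the size of the resulting SGD step
torch.manual_seed(0)
lin = torch.nn.Linear(1, 1).to(torch.float64)
x = torch.randint(0, 100000, (128, 1)).to(torch.float64)  # x values up to n_data
y = 2 * x + 1
loss = torch.nn.functional.mse_loss(lin(x), y)
loss.backward()
print('gradient on weight:', lin.weight.grad.item())
print('weight update with lr=1e-8:', 1e-8 * lin.weight.grad.item())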