I'm building a classifier for a QA bot and have a dataset of 8k questions mapped to 149 different answers.
I ran into a problem while training the model: the loss won't go down the way I expect, so I'm asking for your help...
Here is my method:
I use word2vec to get each word's vector, then a GRU-based network to turn the word vectors into a sentence vector. The w2v model was trained on wiki data and works well in another of my NLP projects. The GRU code comes from a senior of mine, and I think it works well too.
# Part of the code for getting the sentence vectors
import torch
import torch.nn as nn

input_size = 400
hidden_dim = 400
num_layers = 1
gru = nn.GRU(input_size, hidden_dim, num_layers, batch_first=True)
h0 = torch.rand(num_layers, 7187, hidden_dim)  # (num_layers, batch, hidden_dim); the GRU runs over the whole training set at once

# shape of input: [dataset_len, max_sentence_len, input_feature]
inputSet = torch.tensor(x_train, dtype=torch.float)
sentenceVecs, hidden = gru(inputSet, h0)
sentenceVecs = sentenceVecs[:, -1, :]  # take the last timestep's output as the sentence vector
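The lookup/padding step that produces x_train isn't shown above; roughly it is something like this (simplified sketch, the function name, the dict-like w2v lookup, and the zero-padding are placeholders rather than the exact code):

import numpy as np

# Map each token to its 400-d word2vec vector and zero-pad every sentence to
# max_sentence_len, giving the [dataset_len, max_sentence_len, input_feature]
# array used above.
def build_x_train(sentences, w2v, max_sentence_len, dim=400):
    out = np.zeros((len(sentences), max_sentence_len, dim), dtype=np.float32)
    for i, tokens in enumerate(sentences):
        for j, tok in enumerate(tokens[:max_sentence_len]):
            if tok in w2v:  # skip out-of-vocabulary tokens
                out[i, j] = w2v[tok]
    return out

# x_train = build_x_train(tokenized_questions, w2v_model, max_sentence_len)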
And here is my classifier model:
from argparse import Namespace

args = Namespace(
    dataset_file='dataset/waimai_10k_tw.pkl',
    model_save_path='torchmodel/pytorch_bce.model',
    # Training hyperparameters
    batch_size=100,
    learning_rate=0.002,
    min_learning_rate=0.002,
    num_epochs=200,
)
import torch.nn.functional as F

class JWP(nn.Module):
    def __init__(self,
                 n_feature,
                 n_hidden,
                 n_hidden2,
                 n_hidden3,
                 n_output):
        super(JWP, self).__init__()
        self.hidden = nn.Linear(n_feature, n_hidden)
        self.hidden2 = nn.Linear(n_hidden, n_hidden2)
        self.hidden3 = nn.Linear(n_hidden2, n_hidden3)
        self.out = nn.Linear(n_hidden3, n_output)

    def forward(self, x, apply_softmax=False):
        x = F.relu(self.hidden(x).squeeze())
        x = F.relu(self.hidden2(x).squeeze())
        x = F.relu(self.hidden3(x).squeeze())
        if apply_softmax:
            # softmax only for inference; CrossEntropyLoss below expects raw logits
            x = torch.softmax(self.out(x), dim=-1)
        else:
            x = self.out(x)
        return x
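A quick shape check on random data (just an illustration, not part of the training script): a batch of 100 sentence vectors of size 400 should come out as 149 logits.

# Dummy forward pass: 100 sentence vectors of size 400 -> 149 class logits
model = JWP(400, 325, 275, 225, 149)
dummy = torch.rand(100, 400)
logits = model(dummy)
print(logits.shape)  # torch.Size([100, 149])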
Training code:
lr = args.learning_rate
min_lr = args.min_learning_rate

def adjust_learning_rate(optimizer, epoch):
    # Step decay: multiply lr by 0.65 every 10 epochs, with a floor at min_lr
    global lr
    if epoch % 10 == 0 and epoch != 0:
        lr = lr * 0.65
        if lr < min_lr:
            lr = min_lr
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
if __name__ == "__main__":
    EPOCH = args.num_epochs
    net = JWP(400, 325, 275, 225, 149)
    # net = JWP(400, 250, 149)
    # net = JWP(400, 149)
    print(net)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss_func = torch.nn.CrossEntropyLoss()

    for t in range(EPOCH):
        adjust_learning_rate(optimizer, t)

        """
        Train phase
        """
        net.train()
        TrainLoss = 0.0
        # Train batches
        for step, (batchData, batchTarget) in enumerate(trainDataLoader):
            optimizer.zero_grad()
            out = net(batchData)
            loss = loss_func(out, batchTarget)
            TrainLoss = TrainLoss + loss.item()  # accumulate as a float so graphs aren't kept around
            loss.backward()
            optimizer.step()
        TrainLoss = TrainLoss / (step + 1)  # average loss over the epoch

        """
        Result
        """
        print(
            "epoch:", t + 1,
            "train_loss:", round(TrainLoss, 3),
            "LR:", lr
        )
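trainDataLoader isn't shown above; it wraps the sentence vectors and the integer answer indices (0..148), roughly like this (the names y_train and answer_indices are placeholders):

from torch.utils.data import TensorDataset, DataLoader

# Sentence vectors as features, integer answer indices as targets
# (CrossEntropyLoss expects class indices, not one-hot vectors).
y_train = torch.tensor(answer_indices, dtype=torch.long)
trainDataset = TensorDataset(sentenceVecs.detach(), y_train)
trainDataLoader = DataLoader(trainDataset, batch_size=args.batch_size, shuffle=True)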
Is my model too simple, or am I simply using the wrong approach? The loss is always stuck around 4.6 and I can't get it any lower...
epoch: 2898 train_loss: 4.643 LR: 0.002
epoch: 2899 train_loss: 4.643 LR: 0.002
epoch: 2900 train_loss: 4.643 LR: 0.002
epoch: 2901 train_loss: 4.643 LR: 0.002