I am trying to train a model on custom data so that I can eventually build a chatbot from it. The problem is that training the model results in a loss of NaN.
I am trying to train the model and see some sort of loss, but the training loop breaks on the first iteration and the model is never trained. I also want to get an answer from the model, but it returns nothing.
This is the code:
# Preprocess your dataset
# Parallel lists: answers[i] is the expected answer to questions[i].
questions = [
    "What is the capital of France",
    "Who invented the telephone",
]
answers = [
    "Paris",
    "Alexander Graham Bell",
]
# Each (question, answer) tuple is encoded as a single text pair. The batch
# is requested padded, truncated to at most 256 tokens, and returned as
# PyTorch tensors.
question_answer_pairs = list(zip(questions, answers))
encoded_inputs = tokenizer.batch_encode_plus(
    question_answer_pairs,
    max_length=256,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
# Fine-tune the BERT model
#
# NOTE(review): lr=1e-6 is kept from the original script. It is well below
# the 2e-5 to 5e-5 range commonly used for fine-tuning BERT, so the loss may
# barely move in 5 steps -- confirm this value is intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)
input_ids = encoded_inputs['input_ids']
attention_mask = encoded_inputs['attention_mask']


def _find_answer_span(sequence_ids, answer_ids):
    """Return the (start, end) token indices of an answer inside a sequence.

    Args:
        sequence_ids: list of token ids for one encoded input row.
        answer_ids: list of token ids for the answer (no special tokens).

    Returns:
        A tuple ``(start, end)`` of inclusive indices into ``sequence_ids``.

    Raises:
        ValueError: if ``answer_ids`` is empty or does not occur in
            ``sequence_ids`` (for example because it was truncated away).
    """
    span_len = len(answer_ids)
    if span_len == 0:
        raise ValueError("Answer tokenized to an empty sequence.")
    # Search from the end: the answer is the second text of the encoded pair,
    # so the last occurrence is the answer itself rather than a word that
    # happens to appear in the question as well.
    for start in range(len(sequence_ids) - span_len, -1, -1):
        if sequence_ids[start:start + span_len] == answer_ids:
            return start, start + span_len - 1
    raise ValueError("Answer tokens not found in the encoded input.")


# The labels must be token *positions* inside input_ids, not vocabulary ids.
# The original code passed tokenizer.encode(answer)[0] / [-1], which are
# vocabulary ids. Assuming `model` is a Hugging Face question-answering head,
# positions outside the sequence are ignored when computing the loss, so every
# label was dropped -- presumably the reason the loss came out as NaN.
answer_spans = [
    _find_answer_span(row, tokenizer.encode(answer, add_special_tokens=False))
    for row, answer in zip(input_ids.tolist(), answers)
]
start_positions = torch.tensor([start for start, _ in answer_spans])
end_positions = torch.tensor([end for _, end in answer_spans])

model.train()
for epoch in range(5):
    optimizer.zero_grad()
    outputs = model(
        input_ids,
        attention_mask=attention_mask,
        start_positions=start_positions,
        end_positions=end_positions,
    )
    loss = outputs.loss
    # Check for NaN loss before back-propagating, so NaN gradients never
    # reach the optimizer step.
    if torch.isnan(loss):
        print(f"NaN loss encountered at epoch {epoch+1}. Training stopped.")
        break
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1} Loss: {loss.item()}")
    # Check for loss explosion
    if loss.item() > 1e5:
        print(f"Loss exploded at epoch {epoch+1}. Training stopped.")
        break
# Test the model on the training questions
#
# NOTE: the answer below is decoded from a slice of test_input_ids, and
# test_input_ids encodes only the question. The prediction can therefore only
# contain tokens from the question itself; to extract "Paris" the input would
# also need a text that contains the answer.
test_question = "What is the capital of France?"
encoded_test = tokenizer(test_question, padding=True, truncation=True, return_tensors='pt')
test_input_ids = encoded_test['input_ids']
test_attention_mask = encoded_test['attention_mask']
model.eval()
# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(test_input_ids, attention_mask=test_attention_mask)
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
# Get the predicted start and end indices
# Assumes the logits are (batch, sequence_length) with a batch of one, so
# row 0 is the only example -- TODO confirm for the model in use.
predicted_start = torch.argmax(start_scores[0]).item()
# Only consider end positions at or after the start. Two independent argmax
# calls can yield end < start, which makes the slice empty and the decoded
# answer blank.
predicted_end = predicted_start + torch.argmax(end_scores[0, predicted_start:]).item()
# Decode the predicted answer
predicted_answer = tokenizer.decode(test_input_ids[0][predicted_start:predicted_end + 1])
print(f"Predicted Answer: {predicted_answer}")