I am trying to fine-tune the Helsinki model (Helsinki-NLP/opus-mt-de-fr) with the Hugging Face trainer for a German-to-French translation task, based on the documentation found under link 1 and link 2. I am using k-fold cross validation for the performance check and running the fine-tuning in an AWS SageMaker notebook. Training speed is fine, but during evaluation the throughput drops dramatically from about 2.7 it/s (during training/fine-tuning) to about 0.07 it/s (during validation).
This is the code I am using for fine-tuning:
import json, os
import evaluate
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import KFold
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq,
                          EarlyStoppingCallback, Seq2SeqTrainer, Seq2SeqTrainingArguments)
# checkpoint
checkpoint = "Helsinki-NLP/opus-mt-de-fr"
source_lang = "de"
target_lang = "fr"
prefix = "Übersetzen Deutsch ins Französisch: "
# dataset
dataset_opus100 = load_dataset("opus100", "de-fr", split="test")
final_corpus_dataset = dataset_opus100
# preprocess
def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=128, truncation=True
    )
    return model_inputs
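# For reference, each opus100 "de-fr" row carries a nested "translation" dict, roughly
# {"translation": {"de": "<German sentence>", "fr": "<French sentence>"}}
# (structure inferred from how the fields are accessed above), so the function builds
# prefixed German inputs and plain French targets.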
# metric
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    # print(result)
    return result
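# compute_metrics therefore returns a small dict per evaluation, e.g. (values illustrative):
# {"bleu": 30.1234, "gen_len": 21.5}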
# config_params
exp_name = "de-fr-helsinki"
seed = 42
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
# K-fold Cross Validation model evaluation
for fold, (train_ids, val_ids) in enumerate(kfold.split(final_corpus_dataset)):
    # if fold == 0 or fold == 1:
    #     continue
    # metric
    metric = evaluate.load("sacrebleu")
    # arguments for training
    training_args = Seq2SeqTrainingArguments(
        output_dir=f"test/huggingface_exps/{exp_name}/fold{fold}",
        evaluation_strategy="epoch",
        logging_strategy="epoch",
        # logging_steps=4,
        learning_rate=2e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        weight_decay=0.01,
        num_train_epochs=10,
        save_total_limit=2,
        save_strategy="epoch",
        load_best_model_at_end=True,
        predict_with_generate=True,
        fp16=False,
        push_to_hub=False,
        # tensorboard log directory
        logging_dir=f"test/huggingface_exps/{exp_name}/fold{fold}/runs",
        report_to=["tensorboard"],
        # dataloader_num_workers=2,
    )
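    # Note: with predict_with_generate=True, the Seq2SeqTrainer calls model.generate()
    # during evaluation to produce the token ids that compute_metrics decodes.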
print("fold", fold)
# reset tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
# select splits
train_dataset = final_corpus_dataset.select(train_ids)
eval_dataset = final_corpus_dataset.select(val_ids)
# preprocess
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        # train_dataset=Dataset.from_dict(train_dataset[:40]),
        # eval_dataset=Dataset.from_dict(eval_dataset[:20]),
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[TensorBoardCallback]
        callbacks=[
            CombinedTensorBoardCallback,  # custom callback I define elsewhere in the notebook
            EarlyStoppingCallback(early_stopping_patience=3),
        ],
    )
    train_result = trainer.train()
    # compute train results
    metrics = train_result.metrics
    # save the metrics history of this fold to disk
    # create the directory if it doesn't exist
    os.makedirs(f"test/huggingface_exps/{exp_name}/fold{fold}/", exist_ok=True)
    with open(f"test/huggingface_exps/{exp_name}/fold{fold}/console.json", "w") as f:
        json.dump(trainer.state.log_history, f)
    # next fold
Things I considered:
- GPU memory is enough to fit the process (14.5 GB / 15.5 GB used).
- Reducing the batch size for the evaluation process didn't help (see the sketch after this list).
- While training on SageMaker I got this error, in contrast to the training process with mt5-small:
  ValueError: This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed in order to use this tokenizer
  I had to install sentencepiece with `! pip install transformers[sentencepiece]`. I'm not sure whether this affects performance during evaluation (nothing about it is mentioned in the documentation).
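To make the batch-size point concrete, this is roughly the kind of change I mean; the value 8 is only an illustrative value, and everything not shown stays exactly as in the training arguments inside the loop above (exp_name and fold come from that loop):

training_args = Seq2SeqTrainingArguments(
    output_dir=f"test/huggingface_exps/{exp_name}/fold{fold}",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=8,  # reduced from 64 for evaluation; illustrative value
    num_train_epochs=10,
    predict_with_generate=True,
    # ... remaining arguments unchanged from the snippet above
)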
Packages I installed in this order:
! pip install transformers==4.28.0
! pip install datasets
! pip install evaluate
! pip install torch
! pip install sklearn
! pip install tensorboard
! pip install sacrebleu
! pip install accelerate -U
! pip install transformers[sentencepiece]
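In case the environment matters, the versions that actually end up in the notebook kernel after these installs can be checked with:

import datasets, evaluate, torch, transformers
print(transformers.__version__, datasets.__version__, torch.__version__, evaluate.__version__)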