Hello there! I'm attempting to train a GPT-2 model to summarize passages without compromising their emotional impact. Think of summarizing a chapter from a book in a way that lets the reader experience the same emotions as the chapter itself. I'm working with a Kaggle dataset of Amazon fine food reviews (/kaggle/input/amazon-fine-food-reviews/Reviews.csv).
First, I extracted features from each review: processed_text (the review text with stop words removed), a sentiment label, and an emotion label, using BERT- and T5-based models.
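The extraction code isn't shown in this post; roughly, it looked like the sketch below (the emotion model name here is just an assumption to make the sketch concrete, any Hugging Face emotion classifier would do, and in my notebook I used BERT- and T5-based models):

import pandas as pd
import nltk
from nltk.corpus import stopwords
from transformers import pipeline

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

data = pd.read_csv('/kaggle/input/amazon-fine-food-reviews/Reviews.csv')

# processed_text: the review text with stop words removed
data['processed_text'] = data['Text'].apply(
    lambda text: ' '.join(w for w in str(text).split() if w.lower() not in stop_words)
)

# sentiment: the pipeline returns a list like [{'label': 'POSITIVE', 'score': 0.99}]
sentiment_pipeline = pipeline('sentiment-analysis')
data['sentiment'] = data['Text'].apply(lambda text: sentiment_pipeline(str(text)[:512]))

# emotions: a single emotion label per review (model name is an assumption)
emotion_pipeline = pipeline('text-classification', model='j-hartmann/emotion-english-distilroberta-base')
data['emotions'] = data['Text'].apply(lambda text: emotion_pipeline(str(text)[:512])[0]['label'])

With those columns in place, I tokenized the features and the corresponding summaries as follows: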
!pip install transformers torch
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Preprocess sentiment values to extract sentiment labels
data['sentiment'] = data['sentiment'].apply(lambda sentiment: sentiment[0]['label']) # Extracting the sentiment label from the dictionary
# Tokenize the processed_text, sentiment, and emotion
data['processed_text_tokenized'] = data['processed_text'].apply(lambda text: tokenizer.encode(text, truncation=True, padding='max_length', max_length=256))
data['sentiment_tokenized'] = data['sentiment'].apply(lambda sentiment: tokenizer.encode(sentiment, truncation=True, padding='max_length', max_length=4))
data['emotion_tokenized'] = data['emotions'].apply(lambda emotion: tokenizer.encode(emotion, truncation=True, padding='max_length', max_length=4))
# Tokenize the summaries
data['summary_tokenized'] = data['Summary'].apply(lambda summary: tokenizer.encode(summary, truncation=True, padding='max_length', max_length=128))
Next, I created the dataset and dataloader as follows:
import torch
from torch.utils.data import Dataset, DataLoader
class EmotionAwareSummaryDataset(Dataset):
    def __init__(self, processed_text, sentiment, emotion, summary):
        self.processed_text = processed_text
        self.sentiment = sentiment
        self.emotion = emotion
        self.summary = summary

    def __len__(self):
        return len(self.processed_text)

    def __getitem__(self, idx):
        # Concatenate the tokenized review, sentiment, and emotion as the input sequence
        input_ids = self.processed_text[idx] + self.sentiment[idx] + self.emotion[idx]
        attention_mask = torch.ones(len(input_ids))
        decoder_input_ids = self.summary[idx]
        decoder_attention_mask = torch.ones(len(decoder_input_ids))
        # Use the tokenized summary as labels, masking padding tokens for the loss
        labels = torch.tensor(decoder_input_ids).clone()
        labels[labels == tokenizer.pad_token_id] = -100  # -100 is ignored by the loss
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "decoder_input_ids": decoder_input_ids,
            "decoder_attention_mask": decoder_attention_mask,
            "labels": labels  # the Trainer needs labels to compute the loss
        }
# Create the dataset and dataloader
train_dataset = EmotionAwareSummaryDataset(
    processed_text=data['processed_text_tokenized'].tolist(),
    sentiment=data['sentiment_tokenized'].tolist(),
    emotion=data['emotion_tokenized'].tolist(),
    summary=data['summary_tokenized'].tolist()
)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
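As a side note, a quick way to see what shapes actually come out of the dataloader is to pull one batch and print it (just a sanity-check sketch, not part of the training code):

# Inspect a single batch to see the shapes the model will receive
batch = next(iter(train_dataloader))
for key, value in batch.items():
    if hasattr(value, 'shape'):
        print(key, tuple(value.shape))
    else:
        print(key, type(value), len(value))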
Finally, I fine-tuned the GPT-2 model as follows:
from transformers import GPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments
# Load GPT-2 model configuration
config = GPT2Config.from_pretrained("gpt2", output_hidden_states=True)
# Load the GPT-2 language-modeling model and resize its embeddings for the added [PAD] token
model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)
model.resize_token_embeddings(len(tokenizer))
model.train()
output = "./emotion_aware_summary"
# Define the training arguments
training_args = TrainingArguments(
    output_dir=output,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
)
# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)
# Start fine-tuning
trainer.train()
model.save_pretrained(output)
tokenizer.save_pretrained(output)
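For context, once training works, I expect to generate summaries roughly like this (just a sketch; the example text, labels, and decoding settings are placeholders, not tuned values):

# Sketch of intended inference: concatenate review, sentiment, and emotion tokens, then generate
review_ids = tokenizer.encode("great coffee, smooth and rich", truncation=True, max_length=256)
sentiment_ids = tokenizer.encode("POSITIVE", truncation=True, max_length=4)
emotion_ids = tokenizer.encode("joy", truncation=True, max_length=4)
input_ids = torch.tensor([review_ids + sentiment_ids + emotion_ids]).to(model.device)

output_ids = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    max_new_tokens=60,
    pad_token_id=tokenizer.pad_token_id,
)
# Strip the prompt tokens and decode only the generated summary
print(tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True))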
But when I run the notebook, I get the following error:
/opt/conda/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ in <module>:29 │
│ │
│ 26 ) │
│ 27 │
│ 28 # Start fine-tuning │
│ ❱ 29 trainer.train() │
│ 30 model.save_pretrained(output) │
│ 31 tokenizer.save_pretrained(output) │
│ 32 │
│ │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1645 in train │
│ │
│ 1642 │ │ inner_training_loop = find_executable_batch_size( │
│ 1643 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1644 │ │ ) │
│ ❱ 1645 │ │ return inner_training_loop( │
│ 1646 │ │ │ args=args, │
│ 1647 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1648 │ │ │ trial=trial, │
│ │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1938 in _inner_training_loop │
│ │
│ 1935 │ │ │ │ │ self.control = self.callback_handler.on_step_begin(args, self.state, │
│ 1936 │ │ │ │ │
│ 1937 │ │ │ │ with self.accelerator.accumulate(model): │
│ ❱ 1938 │ │ │ │ │ tr_loss_step = self.training_step(model, inputs) │
│ 1939 │ │ │ │ │
│ 1940 │ │ │ │ if ( │
│ 1941 │ │ │ │ │ args.logging_nan_inf_filter │
│ │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2759 in training_step │
│ │
│ 2756 │ │ │ return loss_mb.reduce_mean().detach().to(self.args.device) │
│ 2757 │ │ │
│ 2758 │ │ with self.compute_loss_context_manager(): │
│ ❱ 2759 │ │ │ loss = self.compute_loss(model, inputs) │
│ 2760 │ │ │
│ 2761 │ │ if self.args.n_gpu > 1: │
│ 2762 │ │ │ loss = loss.mean() # mean() to average on multi-gpu parallel training │
│ │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2784 in compute_loss │
│ │
│ 2781 │ │ │ labels = inputs.pop("labels") │
│ 2782 │ │ else: │
│ 2783 │ │ │ labels = None │
│ ❱ 2784 │ │ outputs = model(**inputs) │
│ 2785 │ │ # Save past state if it exists │
│ 2786 │ │ # TODO: this needs to be fixed and made cleaner later. │
│ 2787 │ │ if self.args.past_index >= 0: │
│ │
│ /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /opt/conda/lib/python3.10/site-packages/transformers/models/gpt2/modeling_gpt2.py:1113 in │
│ forward │
│ │
│ 1110 │ │ │ shift_labels = labels[..., 1:].contiguous() │
│ 1111 │ │ │ # Flatten the tokens │
│ 1112 │ │ │ loss_fct = CrossEntropyLoss() │
│ ❱ 1113 │ │ │ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.v │
│ 1114 │ │ │
│ 1115 │ │ if not return_dict: │
│ 1116 │ │ │ output = (lm_logits,) + transformer_outputs[1:] │
│ │
│ /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /opt/conda/lib/python3.10/site-packages/torch/nn/modules/loss.py:1174 in forward │
│ │
│ 1171 │ │ self.label_smoothing = label_smoothing │
│ 1172 │ │
│ 1173 │ def forward(self, input: Tensor, target: Tensor) -> Tensor: │
│ ❱ 1174 │ │ return F.cross_entropy(input, target, weight=self.weight, │
│ 1175 │ │ │ │ │ │ │ ignore_index=self.ignore_index, reduction=self.reduction, │
│ 1176 │ │ │ │ │ │ │ label_smoothing=self.label_smoothing) │
│ 1177 │
│ │
│ /opt/conda/lib/python3.10/site-packages/torch/nn/functional.py:3029 in cross_entropy │
│ │
│ 3026 │ │ ) │
│ 3027 │ if size_average is not None or reduce is not None: │
│ 3028 │ │ reduction = _Reduction.legacy_get_string(size_average, reduce) │
│ ❱ 3029 │ return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(re │
│ 3030 │
│ 3031 │
│ 3032 def binary_cross_entropy( │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
ValueError: Expected input batch_size (1052) to match target batch_size (508).
I feel the issue is that the batch sizes of my inputs and targets do not match, but I can't find where the problem is triggered. If you can help me, that would be great! Also, if you have suggestions for a better approach than mine, please share them. Thanks in advance.