Hello there! I'm attempting to train a GPT-2 model to summarize passages without compromising their emotional impact. Think of summarizing a chapter from a book in a way that lets the reader experience the same emotions as the chapter itself. I'm working with a Kaggle dataset of Amazon fine food reviews (/kaggle/input/amazon-fine-food-reviews/Reviews.csv).
First, I extracted features from each review: processed_text (the review text with stop words removed), a sentiment label, and an emotion label, using BERT- and T5-based models.
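The extraction code isn't shown in this post; roughly, it looked like the sketch below (the emotion model name here is just an assumption to make the sketch concrete, any Hugging Face emotion classifier would do, and in my notebook I used BERT- and T5-based models):

import pandas as pd
import nltk
from nltk.corpus import stopwords
from transformers import pipeline

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

data = pd.read_csv('/kaggle/input/amazon-fine-food-reviews/Reviews.csv')

# processed_text: the review text with stop words removed
data['processed_text'] = data['Text'].apply(
    lambda text: ' '.join(w for w in str(text).split() if w.lower() not in stop_words)
)

# sentiment: the pipeline returns a list like [{'label': 'POSITIVE', 'score': 0.99}]
sentiment_pipeline = pipeline('sentiment-analysis')
data['sentiment'] = data['Text'].apply(lambda text: sentiment_pipeline(str(text)[:512]))

# emotions: a single emotion label per review (model name is an assumption)
emotion_pipeline = pipeline('text-classification', model='j-hartmann/emotion-english-distilroberta-base')
data['emotions'] = data['Text'].apply(lambda text: emotion_pipeline(str(text)[:512])[0]['label'])

With those columns in place, I tokenized the features and the corresponding summaries as follows: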
!pip install transformers torch
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Preprocess sentiment values to extract sentiment labels
data['sentiment'] = data['sentiment'].apply(lambda sentiment: sentiment[0]['label']) # Extracting the sentiment label from the dictionary
# Tokenize the processed_text, sentiment, and emotion
data['processed_text_tokenized'] = data['processed_text'].apply(lambda text: tokenizer.encode(text, truncation=True, padding='max_length', max_length=256))
data['sentiment_tokenized'] = data['sentiment'].apply(lambda sentiment: tokenizer.encode(sentiment, truncation=True, padding='max_length', max_length=4))
data['emotion_tokenized'] = data['emotions'].apply(lambda emotion: tokenizer.encode(emotion, truncation=True, padding='max_length', max_length=4))
# Tokenize the summaries
data['summary_tokenized'] = data['Summary'].apply(lambda summary: tokenizer.encode(summary, truncation=True, padding='max_length', max_length=128))
Next, I created the dataset and dataloader as follows:
import torch
from torch.utils.data import Dataset, DataLoader
class EmotionAwareSummaryDataset(Dataset):
    def __init__(self, processed_text, sentiment, emotion, summary):
        self.processed_text = processed_text
        self.sentiment = sentiment
        self.emotion = emotion
        self.summary = summary

    def __len__(self):
        return len(self.processed_text)

    def __getitem__(self, idx):
        # Concatenate the tokenized review, sentiment, and emotion as the input sequence
        input_ids = self.processed_text[idx] + self.sentiment[idx] + self.emotion[idx]
        attention_mask = torch.ones(len(input_ids))
        decoder_input_ids = self.summary[idx]
        decoder_attention_mask = torch.ones(len(decoder_input_ids))
        # Use the tokenized summary as labels, masking padding tokens for the loss
        labels = torch.tensor(decoder_input_ids).clone()
        labels[labels == tokenizer.pad_token_id] = -100  # -100 is ignored by the loss
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "decoder_input_ids": decoder_input_ids,
            "decoder_attention_mask": decoder_attention_mask,
            "labels": labels  # the Trainer needs labels to compute the loss
        }
# Create the dataset and dataloader
train_dataset = EmotionAwareSummaryDataset(
    processed_text=data['processed_text_tokenized'].tolist(),
    sentiment=data['sentiment_tokenized'].tolist(),
    emotion=data['emotion_tokenized'].tolist(),
    summary=data['summary_tokenized'].tolist()
)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
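As a side note, a quick way to see what shapes actually come out of the dataloader is to pull one batch and print it (just a sanity-check sketch, not part of the training code):

# Inspect a single batch to see the shapes the model will receive
batch = next(iter(train_dataloader))
for key, value in batch.items():
    if hasattr(value, 'shape'):
        print(key, tuple(value.shape))
    else:
        print(key, type(value), len(value))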
Finally, I fine-tuned the GPT-2 model as follows:
from transformers import GPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments
# Load GPT-2 model configuration
config = GPT2Config.from_pretrained("gpt2", output_hidden_states=True)
# Load the GPT-2 language-modeling model and resize its embeddings for the added [PAD] token
model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)
model.resize_token_embeddings(len(tokenizer))
model.train()
output = "./emotion_aware_summary"
# Define the training arguments
training_args = TrainingArguments(
    output_dir=output,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
)
# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)
# Start fine-tuning
trainer.train()
model.save_pretrained(output)
tokenizer.save_pretrained(output)
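For context, once training works, I expect to generate summaries roughly like this (just a sketch; the example text, labels, and decoding settings are placeholders, not tuned values):

# Sketch of intended inference: concatenate review, sentiment, and emotion tokens, then generate
review_ids = tokenizer.encode("great coffee, smooth and rich", truncation=True, max_length=256)
sentiment_ids = tokenizer.encode("POSITIVE", truncation=True, max_length=4)
emotion_ids = tokenizer.encode("joy", truncation=True, max_length=4)
input_ids = torch.tensor([review_ids + sentiment_ids + emotion_ids]).to(model.device)

output_ids = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    max_new_tokens=60,
    pad_token_id=tokenizer.pad_token_id,
)
# Strip the prompt tokens and decode only the generated summary
print(tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True))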
But when I run the notebook, I get the following error:
/opt/conda/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ in <module>:29 │
│ │
│ 26 ) │
│ 27 │
│ 28 # Start fine-tuning │
│ ❱ 29 trainer.train() │
│ 30 model.save_pretrained(output) │
│ 31 tokenizer.save_pretrained(output) │
│ 32 │
│ │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1645 in train │
│ │
│ 1642 │ │ inner_training_loop = find_executable_batch_size( │
│ 1643 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1644 │ │ ) │
│ ❱ 1645 │ │ return inner_training_loop( │
│ 1646 │ │ │ args=args, │
│ 1647 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1648 │ │ │ trial=trial, │
│ │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1938 in _inner_training_loop │
│ │
│ 1935 │ │ │ │ │ self.control = self.callback_handler.on_step_begin(args, self.state, │
│ 1936 │ │ │ │ │
│ 1937 │ │ │ │ with self.accelerator.accumulate(model): │
│ ❱ 1938 │ │ │ │ │ tr_loss_step = self.training_step(model, inputs) │
│ 1939 │ │ │ │ │
│ 1940 │ │ │ │ if ( │
│ 1941 │ │ │ │ │ args.logging_nan_inf_filter │
│ │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2759 in training_step │
│ │
│ 2756 │ │ │ return loss_mb.reduce_mean().detach().to(self.args.device) │
│ 2757 │ │ │
│ 2758 │ │ with self.compute_loss_context_manager(): │
│ ❱ 2759 │ │ │ loss = self.compute_loss(model, inputs) │
│ 2760 │ │ │
│ 2761 │ │ if self.args.n_gpu > 1: │
│ 2762 │ │ │ loss = loss.mean() # mean() to average on multi-gpu parallel training │
│ │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2784 in compute_loss │
│ │
│ 2781 │ │ │ labels = inputs.pop("labels") │
│ 2782 │ │ else: │
│ 2783 │ │ │ labels = None │
│ ❱ 2784 │ │ outputs = model(**inputs) │
│ 2785 │ │ # Save past state if it exists │
│ 2786 │ │ # TODO: this needs to be fixed and made cleaner later. │
│ 2787 │ │ if self.args.past_index >= 0: │
│ │
│ /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /opt/conda/lib/python3.10/site-packages/transformers/models/gpt2/modeling_gpt2.py:1113 in │
│ forward │
│ │
│ 1110 │ │ │ shift_labels = labels[..., 1:].contiguous() │
│ 1111 │ │ │ # Flatten the tokens │
│ 1112 │ │ │ loss_fct = CrossEntropyLoss() │
│ ❱ 1113 │ │ │ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.v │
│ 1114 │ │ │
│ 1115 │ │ if not return_dict: │
│ 1116 │ │ │ output = (lm_logits,) + transformer_outputs[1:] │
│ │
│ /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /opt/conda/lib/python3.10/site-packages/torch/nn/modules/loss.py:1174 in forward │
│ │
│ 1171 │ │ self.label_smoothing = label_smoothing │
│ 1172 │ │
│ 1173 │ def forward(self, input: Tensor, target: Tensor) -> Tensor: │
│ ❱ 1174 │ │ return F.cross_entropy(input, target, weight=self.weight, │
│ 1175 │ │ │ │ │ │ │ ignore_index=self.ignore_index, reduction=self.reduction, │
│ 1176 │ │ │ │ │ │ │ label_smoothing=self.label_smoothing) │
│ 1177 │
│ │
│ /opt/conda/lib/python3.10/site-packages/torch/nn/functional.py:3029 in cross_entropy │
│ │
│ 3026 │ │ ) │
│ 3027 │ if size_average is not None or reduce is not None: │
│ 3028 │ │ reduction = _Reduction.legacy_get_string(size_average, reduce) │
│ ❱ 3029 │ return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(re │
│ 3030 │
│ 3031 │
│ 3032 def binary_cross_entropy( │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
ValueError: Expected input batch_size (1052) to match target batch_size (508).
I feel the issue is that the batch sizes of my inputs and targets do not match, but I can't find where the problem is triggered. If you can help me, that would be great! Also, if you have suggestions for a better approach than mine, please share them. Thanks in advance.