In my Jupyter notebook I have the following code, and I cannot figure out why it throws an IndexError: index out of range in self error.
Here is the code:
!pip install torch
!pip install torchvision
!pip install transformers
import torch
from torch.utils.data import Dataset
class MakeDataset(Dataset):
    def __init__(self, tokenized_texts, block_size):
        self.examples = []
        for tokens in tokenized_texts:
            # truncate the tokens if they are longer than block_size
            if len(tokens) > block_size:
                tokens = tokens[:block_size]
            # add padding tokens if the tokens are shorter than block_size
            while len(tokens) < block_size:
                tokens.append(tokenizer.pad_token_id)
            self.examples.append(torch.tensor(tokens, dtype=torch.long))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return self.examples[item]
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoTokenizer, \
AutoModelWithLMHead, GPT2Tokenizer
# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='right')
model = AutoModelWithLMHead.from_pretrained('gpt2')
PAD_TOKEN = '<PAD>'
tokenizer.add_special_tokens({'pad_token': PAD_TOKEN})
# Load text corpus
with open("texts.txt", encoding="utf-8") as f:
texts = f.read().splitlines()
print(len(texts) , " lines of text.")
# Tokenize the texts
tokenized_texts = []
for text in texts:
    tokens = tokenizer.encode(text, padding='max_length', truncation='only_first')
    if len(tokens) > 0:
        tokenized_texts.append(tokens)
# generate a dataset
dataset = MakeDataset(tokenized_texts, block_size=1024)
print("Dataset length: ", len(dataset))
# Create a DataCollatorForLanguageModeling object
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',              # output directory
    num_train_epochs=5,                  # total number of training epochs
    per_device_train_batch_size=16,      # batch size per device during training
    save_steps=1000,                     # number of steps between saving checkpoints
    save_total_limit=2,                  # limit the total number of saved checkpoints
    prediction_loss_only=True,           # only return the loss during evaluation
    learning_rate=1e-5,                  # learning rate
    warmup_steps=500,                    # number of warmup steps for the learning rate scheduler
    fp16=False                           # mixed precision training disabled
)
# Create a Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)
# Train the model
trainer.train()
# Save the trained model
trainer.save_model('./fine-tuned-gpt2')
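For reference, here is a quick sanity check (just a sketch, run in a separate cell using the objects defined above) that compares the largest token id in the dataset with the size of the model's input embedding table:
# sketch: check whether any token id exceeds the model's embedding table
max_id = max(int(example.max()) for example in dataset.examples)
print("largest token id in dataset:", max_id)
print("rows in input embedding:", model.get_input_embeddings().weight.shape[0])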
The text file at the moment looks very simple:
Hello, my name is Paul.
My cat can sing.
The full error is:
IndexError Traceback (most recent call last)
Cell In[140], line 54
46 trainer = Trainer(
47 model=model,
48 args=training_args,
49 data_collator=data_collator,
50 train_dataset=dataset
51 )
53 # Train the model
---> 54 trainer.train()
56 # Save the trained model
57 trainer.save_model('./fine-tuned-gpt2')
File /opt/homebrew/lib/python3.11/site-packages/transformers/trainer.py:1633, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1628 self.model_wrapped = self.model
1630 inner_training_loop = find_executable_batch_size(
1631 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1632 )
-> 1633 return inner_training_loop(
1634 args=args,
1635 resume_from_checkpoint=resume_from_checkpoint,
1636 trial=trial,
1637 ignore_keys_for_eval=ignore_keys_for_eval,
1638 )
File /opt/homebrew/lib/python3.11/site-packages/transformers/trainer.py:1902, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1900 tr_loss_step = self.training_step(model, inputs)
1901 else:
-> 1902 tr_loss_step = self.training_step(model, inputs)
1904 if (
1905 args.logging_nan_inf_filter
1906 and not is_torch_tpu_available()
1907 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1908 ):
1909 # if loss is nan or inf simply add the average of previous logged losses
1910 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File /opt/homebrew/lib/python3.11/site-packages/transformers/trainer.py:2645, in Trainer.training_step(self, model, inputs)
2642 return loss_mb.reduce_mean().detach().to(self.args.device)
2644 with self.compute_loss_context_manager():
-> 2645 loss = self.compute_loss(model, inputs)
2647 if self.args.n_gpu > 1:
2648 loss = loss.mean() # mean() to average on multi-gpu parallel training
File /opt/homebrew/lib/python3.11/site-packages/transformers/trainer.py:2677, in Trainer.compute_loss(self, model, inputs, return_outputs)
2675 else:
2676 labels = None
-> 2677 outputs = model(**inputs)
2678 # Save past state if it exists
2679 # TODO: this needs to be fixed and made cleaner later.
2680 if self.args.past_index >= 0:
File /opt/homebrew/lib/python3.11/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/homebrew/lib/python3.11/site-packages/transformers/models/gpt2/modeling_gpt2.py:1075, in GPT2LMHeadModel.forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, use_cache, output_attentions, output_hidden_states, return_dict)
1067 r"""
1068 labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1069 Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
1070 `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
1071 are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
1072 """
1073 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1075 transformer_outputs = self.transformer(
1076 input_ids,
1077 past_key_values=past_key_values,
1078 attention_mask=attention_mask,
1079 token_type_ids=token_type_ids,
1080 position_ids=position_ids,
1081 head_mask=head_mask,
1082 inputs_embeds=inputs_embeds,
1083 encoder_hidden_states=encoder_hidden_states,
1084 encoder_attention_mask=encoder_attention_mask,
1085 use_cache=use_cache,
1086 output_attentions=output_attentions,
1087 output_hidden_states=output_hidden_states,
1088 return_dict=return_dict,
1089 )
1090 hidden_states = transformer_outputs[0]
1092 # Set device for model parallelism
File /opt/homebrew/lib/python3.11/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/homebrew/lib/python3.11/site-packages/transformers/models/gpt2/modeling_gpt2.py:842, in GPT2Model.forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions, output_hidden_states, return_dict)
839 head_mask = self.get_head_mask(head_mask, self.config.n_layer)
841 if inputs_embeds is None:
--> 842 inputs_embeds = self.wte(input_ids)
843 position_embeds = self.wpe(position_ids)
844 hidden_states = inputs_embeds + position_embeds
File /opt/homebrew/lib/python3.11/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/homebrew/lib/python3.11/site-packages/torch/nn/modules/sparse.py:162, in Embedding.forward(self, input)
161 def forward(self, input: Tensor) -> Tensor:
--> 162 return F.embedding(
163 input, self.weight, self.padding_idx, self.max_norm,
164 self.norm_type, self.scale_grad_by_freq, self.sparse)
File /opt/homebrew/lib/python3.11/site-packages/torch/nn/functional.py:2210, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2204 # Note [embedding_renorm set_grad_enabled]
2205 # XXX: equivalent to
2206 # with torch.no_grad():
2207 # torch.embedding_renorm_
2208 # remove once script supports set_grad_enabled
2209 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2210 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self
Can someone tell me what I have done wrong with the training setup?
++ UPDATE ++
I changed MakeDataset to TextDataset so that it returns PyTorch tensors:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: tensor[idx] for key, tensor in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)
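The dataset is built straight from the batch encoding, roughly like this (a sketch; tokenized_texts is the tokenizer output shown a bit further below):
# sketch: wrap the tokenizer's batch encoding in the new dataset class
dataset = TextDataset(tokenized_texts)
print("Dataset length: ", len(dataset))
print(dataset[0])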
The output of print(dataset[0]) is:
{'input_ids': tensor([15496, 11, 616, 1438, 318, 3362, 13, 50257, 50257, 50257,
50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
50257, 50257]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}
and with
tokenized_texts = tokenizer(texts, padding='max_length', truncation=True, return_tensors="pt")
to pad them to the model's maximum length, I get:
6 lines of text.
Dataset length: 6
{'input_ids': tensor([[15496, 11, 616, ..., 50257, 50257, 50257],
[ 3666, 3797, 460, ..., 50257, 50257, 50257],
[32423, 1408, 46097, ..., 50257, 50257, 50257],
[10020, 1044, 6877, ..., 50257, 50257, 50257],
[31319, 288, 292, ..., 50257, 50257, 50257],
[ 7447, 24408, 8834, ..., 50257, 50257, 50257]]), 'attention_mask': tensor([[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0]])}
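The Trainer is then rebuilt with this new dataset in the same way as above (a sketch of the call, assuming nothing else changed) and train() is called again:
# sketch: same Trainer setup as before, only the dataset has changed
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)
trainer.train()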
But I still get the same error. I also deleted all caches.