I am working on a question-answering script that uses the pretrained GPT2LMHeadModel. After tokenization, when I pass the input IDs and attention mask to the model, it raises an IndexError. My code:
import re
import string
import nltk
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer

feedback_dataset = []
# Preprocessing
nltk.download("stopwords")
nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Remove numbers
    text = re.sub(r"\d+", "", text)
    # Tokenization
    tokens = text.split()
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join tokens
    text = " ".join(tokens)
    return text
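# For illustration, the kind of output preprocess_text gives me (the sentence
# below is a made-up placeholder, not an entry from my dataset):
print(preprocess_text("What is the capital of France?"))  # prints: capital france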
# Preprocess the dataset
preprocessed_dataset = [
    {
        "user": preprocess_text(entry["user"]),
        "bot": preprocess_text(entry["bot"])
    }
    for entry in dataset
]
# Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Add padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
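# Diagnostic only (not part of training): compare the tokenizer size and the
# newly added pad token id with the size of the model's embedding table
print("tokenizer size:", len(tokenizer))
print("pad token id:", tokenizer.pad_token_id)
print("embedding rows:", model.get_input_embeddings().num_embeddings)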
# Define the maximum sequence length
max_length = 512 # Set your desired maximum length here
# Tokenize and format the dataset with truncation
tokenized_dataset = tokenizer.batch_encode_plus(
    [(entry["user"], entry["bot"]) for entry in preprocessed_dataset],
    padding="longest",
    truncation=True,
    max_length=max_length,
    return_tensors="pt"
)
input_ids = tokenized_dataset["input_ids"]
attention_mask = tokenized_dataset["attention_mask"]
# Ensure input tensors have correct shape
input_ids = input_ids.squeeze()
attention_mask = attention_mask.squeeze()
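# Diagnostic only: smallest and largest token ids actually present in the batch
print("min input id:", input_ids.min().item(), "max input id:", input_ids.max().item())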
# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()
# Training loop
num_epochs = 2
for epoch in range(num_epochs):
    optimizer.zero_grad()
    inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": input_ids
    }
    print("input_ids shape: ", input_ids.shape, "attention_mask shape: ", attention_mask.shape)  #, "input shape: ", inputs)
    outputs = model(**inputs)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
I am getting the error on the outputs = model(**inputs) line.
The error is:
input_ids shape: torch.Size([5, 19]) attention_mask shape: torch.Size([5, 19])
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-39-3329f43b161a> in <cell line: 7>()
14 print("input_ids shape: ", input_ids.shape,"attention_mask shape: ", attention_mask.shape)#, "input shape: ", inputs)
15
---> 16 outputs = model(**inputs)
17 loss = outputs.loss
18 loss.backward()
6 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.10/dist-packages/transformers/models/gpt2/modeling_gpt2.py in forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, use_cache, output_attentions, output_hidden_states, return_dict)
1074 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1075
-> 1076 transformer_outputs = self.transformer(
1077 input_ids,
1078 past_key_values=past_key_values,
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.10/dist-packages/transformers/models/gpt2/modeling_gpt2.py in forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions, output_hidden_states, return_dict)
841
842 if inputs_embeds is None:
--> 843 inputs_embeds = self.wte(input_ids)
844 position_embeds = self.wpe(position_ids)
845 hidden_states = inputs_embeds + position_embeds
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/sparse.py in forward(self, input)
160
161 def forward(self, input: Tensor) -> Tensor:
--> 162 return F.embedding(
163 input, self.weight, self.padding_idx, self.max_norm,
164 self.norm_type, self.scale_grad_by_freq, self.sparse)
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2208 # remove once script supports set_grad_enabled
2209 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2210 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
2211
2212
IndexError: index out of range in self
The input_ids and attention_mask have the same shape, and the sequence length (19) is well below GPT-2's maximum of 1024. So what could be the problem? Can anyone help me, please?
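For reference, the 1024 limit I mention is the model's maximum context length, which I read from the config (n_positions is the GPT-2 config field for it):

print("max positions:", model.config.n_positions)  # 1024 for gpt2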