
I am working on this question-answering code using the pretrained GPT2LMHeadModel. After tokenization, when I pass the input IDs and attention mask to the model, I get an IndexError. My code:

import re
import string

import nltk
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer

feedback_dataset = []

# Preprocessing
nltk.download("stopwords")
nltk.download("wordnet")

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # Remove numbers
    text = re.sub(r"\d+", "", text)
    
    # Tokenization
    tokens = text.split()
    
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens
    text = " ".join(tokens)
    
    return text

# Preprocess the dataset
preprocessed_dataset = [
    {
        "user": preprocess_text(entry["user"]),
        "bot": preprocess_text(entry["bot"])
    }
    for entry in dataset
]

# Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Add padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Define the maximum sequence length
max_length = 512  # Set your desired maximum length here

# Tokenize and format the dataset with truncation
tokenized_dataset = tokenizer.batch_encode_plus(
    [(entry["user"], entry["bot"]) for entry in preprocessed_dataset],
    padding="longest",
    truncation=True,
    max_length=max_length,
    return_tensors="pt"
)

input_ids = tokenized_dataset["input_ids"]
attention_mask = tokenized_dataset["attention_mask"]

# Ensure input tensors have correct shape
input_ids = input_ids.squeeze()
attention_mask = attention_mask.squeeze()
# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
num_epochs = 2
for epoch in range(num_epochs):
    optimizer.zero_grad()
    inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": input_ids
    }
    print("input_ids shape: ", input_ids.shape,"attention_mask shape: ", attention_mask.shape)#, "input shape: ", inputs)
    
    outputs = model(**inputs)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

I am getting the error on the outputs = model(**inputs) line. The error is:

input_ids shape:  torch.Size([5, 19]) attention_mask shape:  torch.Size([5, 19])
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-39-3329f43b161a> in <cell line: 7>()
     14     print("input_ids shape: ", input_ids.shape,"attention_mask shape: ", attention_mask.shape)#, "input shape: ", inputs)
     15 
---> 16     outputs = model(**inputs)
     17     loss = outputs.loss
     18     loss.backward()

6 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.10/dist-packages/transformers/models/gpt2/modeling_gpt2.py in forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, use_cache, output_attentions, output_hidden_states, return_dict)
   1074         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
   1075 
-> 1076         transformer_outputs = self.transformer(
   1077             input_ids,
   1078             past_key_values=past_key_values,

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.10/dist-packages/transformers/models/gpt2/modeling_gpt2.py in forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions, output_hidden_states, return_dict)
    841 
    842         if inputs_embeds is None:
--> 843             inputs_embeds = self.wte(input_ids)
    844         position_embeds = self.wpe(position_ids)
    845         hidden_states = inputs_embeds + position_embeds

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/sparse.py in forward(self, input)
    160 
    161     def forward(self, input: Tensor) -> Tensor:
--> 162         return F.embedding(
    163             input, self.weight, self.padding_idx, self.max_norm,
    164             self.norm_type, self.scale_grad_by_freq, self.sparse)

/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   2208         # remove once script supports set_grad_enabled
   2209         _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2210     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
   2211 
   2212 

IndexError: index out of range in self

The input_ids and the attention_mask have the same shape, and the sequence length (19) is well below 1024, which is the maximum for GPT-2. So what could be the problem? Can anyone help me, please?
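For reference, a quick check along these lines (a sketch using the variables defined above) compares the largest token id in the padded batch with the number of rows in the model's input embedding table:

# Sanity check: any id equal to or larger than the number of embedding rows
# will make the wte lookup fail with "index out of range in self".
print("max token id in batch:", input_ids.max().item())
print("embedding rows:", model.get_input_embeddings().num_embeddings)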

1 Answer


Remove this line: tokenizer.add_special_tokens({'pad_token': '[PAD]'}) and replace it with: tokenizer.pad_token = tokenizer.eos_token. The two approaches assign different ids to the padding token: add_special_tokens creates a brand-new '[PAD]' token with id 50257, while the EOS token already exists with id 50256. GPT-2's embedding table has only 50257 rows (valid ids 0-50256), so any padded sequence containing id 50257 makes the embedding lookup fail with "index out of range in self". Reusing the EOS token as the pad token keeps every id within range.
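A minimal sketch of the fix, using the same model and tokenizer names as in the question. Either reuse the EOS token for padding, or, if you prefer a dedicated pad token, keep '[PAD]' and call model.resize_token_embeddings so the embedding matrix grows to cover the new id:

from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Option 1: reuse the existing EOS token (id 50256) as the padding token,
# so every id stays inside the original embedding table.
tokenizer.pad_token = tokenizer.eos_token

# Option 2 (alternative): keep a dedicated '[PAD]' token (id 50257) and
# resize the embedding matrix so that id has a row to look up.
# tokenizer.add_special_tokens({"pad_token": "[PAD]"})
# model.resize_token_embeddings(len(tokenizer))

With option 1 the rest of the training code in the question runs unchanged.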