
I am trying to perform text classification with GPT-Neo, using the tweet_eval dataset from Hugging Face. I am following this example https://huggingface.co/docs/transformers/tasks/sequence_classification, but I am running into an error. I am a beginner with LLMs, so any help solving this issue would be much appreciated. Thanks in advance. This is my code:

from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
import datasets
import torch as t
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np

dataset = datasets.load_dataset("tweet_eval","emotion")
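# the "emotion" config of tweet_eval has four classes:
# 0 = anger, 1 = joy, 2 = optimism, 3 = sadness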

x_train = dataset["train"]["text"]
y_train = dataset["train"]["label"]

x_test = dataset["test"]["text"]
y_test = dataset["test"]["label"]

def load_LLM(llm, device):
    num_labels = 4
    id2label = {0: "Anger", 1: "Joy", 2: "Optimism", 3: "Sadness"}
    label2id = {"Anger": 0, "Joy": 1, "Optimism": 2, "Sadness":3}
    model = AutoModelForSequenceClassification.from_pretrained(llm, num_labels=num_labels, id2label=id2label, label2id=label2id)
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(llm)
    return model, tokenizer

llm = "EleutherAI/gpt-neo-2.7B"
device = t.device('cuda' if t.cuda.is_available() else 'cpu')
model,tokenizer = load_LLM(llm,device)

# GPT-Neo's tokenizer has no pad token by default, so register one
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = '[PAD]'
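# Tokenize the whole train/test splits up front; padding=True pads
# every example to the longest sequence in its split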
train_inputs = tokenizer(x_train, truncation=True, padding=True)
test_inputs = tokenizer(x_test, truncation=True, padding=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_inputs,
    eval_dataset=test_inputs,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

I am getting this error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[18], line 1
----> 1 trainer.train()

File ~\anaconda3\envs\pt\lib\site-packages\transformers\trainer.py:1664, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1659     self.model_wrapped = self.model
   1661 inner_training_loop = find_executable_batch_size(
   1662     self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
   1663 )
-> 1664 return inner_training_loop(
   1665     args=args,
   1666     resume_from_checkpoint=resume_from_checkpoint,
   1667     trial=trial,
   1668     ignore_keys_for_eval=ignore_keys_for_eval,
   1669 )

File ~\anaconda3\envs\pt\lib\site-packages\transformers\trainer.py:1909, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1906     rng_to_sync = True
   1908 step = -1
-> 1909 for step, inputs in enumerate(epoch_iterator):
   1910     total_batched_samples += 1
   1911     if rng_to_sync:

File ~\anaconda3\envs\pt\lib\site-packages\torch\utils\data\dataloader.py:633, in _BaseDataLoaderIter.__next__(self)
    630 if self._sampler_iter is None:
    631     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    632     self._reset()  # type: ignore[call-arg]
--> 633 data = self._next_data()
    634 self._num_yielded += 1
    635 if self._dataset_kind == _DatasetKind.Iterable and \
    636         self._IterableDataset_len_called is not None and \
    637         self._num_yielded > self._IterableDataset_len_called:

File ~\anaconda3\envs\pt\lib\site-packages\torch\utils\data\dataloader.py:677, in _SingleProcessDataLoaderIter._next_data(self)
    675 def _next_data(self):
    676     index = self._next_index()  # may raise StopIteration
--> 677     data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    678     if self._pin_memory:
    679         data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

File ~\anaconda3\envs\pt\lib\site-packages\torch\utils\data\_utils\fetch.py:54, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
     52 else:
     53     data = self.dataset[possibly_batched_index]
---> 54 return self.collate_fn(data)

File ~\anaconda3\envs\pt\lib\site-packages\transformers\trainer_utils.py:704, in RemoveColumnsCollator.__call__(self, features)
    702 def __call__(self, features: List[dict]):
    703     features = [self._remove_columns(feature) for feature in features]
--> 704     return self.data_collator(features)

File ~\anaconda3\envs\pt\lib\site-packages\transformers\data\data_collator.py:249, in DataCollatorWithPadding.__call__(self, features)
    248 def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
--> 249     batch = self.tokenizer.pad(
    250         features,
    251         padding=self.padding,
    252         max_length=self.max_length,
    253         pad_to_multiple_of=self.pad_to_multiple_of,
    254         return_tensors=self.return_tensors,
    255     )
    256     if "label" in batch:
    257         batch["labels"] = batch["label"]

File ~\anaconda3\envs\pt\lib\site-packages\transformers\tokenization_utils_base.py:2966, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
   2962 # The model's main input name, usually `input_ids`, has be passed for padding
   2963 if self.model_input_names[0] not in encoded_inputs:
   2964     raise ValueError(
   2965         "You should supply an encoding or a list of encodings to this method "
-> 2966         f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
   2967     )
   2969 required_input = encoded_inputs[self.model_input_names[0]]
   2971 if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):

AttributeError: 'list' object has no attribute 'keys'

I was trying to perform text classification and wanted to fine-tune the model before using it to make predictions.
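For reference, the tutorial I linked tokenizes with Dataset.map and passes the resulting datasets.Dataset objects (which still carry the label column) to the Trainer, rather than tokenizing the raw text lists directly. Below is a minimal sketch of that pattern adapted to my variable names; this is my reading of the tutorial, not code I have confirmed works with GPT-Neo:

def preprocess(examples):
    # Tokenize each tweet; leave padding to DataCollatorWithPadding at batch time
    return tokenizer(examples["text"], truncation=True)

# map() keeps the "label" column, so every item becomes a dict with
# input_ids, attention_mask, and label
tokenized_train = dataset["train"].map(preprocess, batched=True)
tokenized_test = dataset["test"].map(preprocess, batched=True)

# Since I added a new [PAD] token, I believe the model would also need
# model.resize_token_embeddings(len(tokenizer)) and
# model.config.pad_token_id = tokenizer.pad_token_id, but I am not certain.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Is this the pattern I should be using, or is there something else wrong with how I pass train_inputs/test_inputs to the Trainer?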
