I am trying to perform text classification with GPT-Neo on the tweet_eval dataset from Hugging Face. I am following this example: https://huggingface.co/docs/transformers/tasks/sequence_classification, but I am running into an error. I am a beginner with LLMs, so it would be very helpful if someone could help me solve the issue. Thanks in advance. This is my code:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
import datasets
import torch as t
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
dataset = datasets.load_dataset("tweet_eval","emotion")
x_train = dataset["train"]["text"]
y_train = dataset["train"]["label"]
x_test = dataset["test"]["text"]
y_test = dataset["test"]["label"]
def load_LLM(llm, device):
    num_labels = 4
    id2label = {0: "Anger", 1: "Joy", 2: "Optimism", 3: "Sadness"}
    label2id = {"Anger": 0, "Joy": 1, "Optimism": 2, "Sadness": 3}
    model = AutoModelForSequenceClassification.from_pretrained(
        llm, num_labels=num_labels, id2label=id2label, label2id=label2id
    )
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(llm)
    return model, tokenizer
llm = "EleutherAI/gpt-neo-2.7B"
device = t.device('cuda' if t.cuda.is_available() else 'cpu')
model,tokenizer = load_LLM(llm,device)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = '[PAD]'
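# GPT-Neo ships without a pad token, so I add one above. I am not sure
# whether I also need to resize the embeddings afterwards, i.e. something
# like model.resize_token_embeddings(len(tokenizer)) -- that is just my
# guess from the docs, not something the tutorial shows.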
train_inputs = tokenizer(x_train, truncation=True, padding=True)
test_inputs = tokenizer(x_test, truncation=True, padding=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_inputs,
    eval_dataset=test_inputs,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
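For reference, the tutorial I linked does not call the tokenizer on the raw lists of texts the way I do above; it tokenizes with Dataset.map, so every example stays a dict carrying both input_ids and label. My adaptation of that step to tweet_eval (I have not swapped it in yet) would look roughly like:

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = dataset["train"].map(preprocess_function, batched=True)
tokenized_test = dataset["test"].map(preprocess_function, batched=True)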
When I run trainer.train() with my original code, I get this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[18], line 1
----> 1 trainer.train()
File ~\anaconda3\envs\pt\lib\site-packages\transformers\trainer.py:1664, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1659 self.model_wrapped = self.model
1661 inner_training_loop = find_executable_batch_size(
1662 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1663 )
-> 1664 return inner_training_loop(
1665 args=args,
1666 resume_from_checkpoint=resume_from_checkpoint,
1667 trial=trial,
1668 ignore_keys_for_eval=ignore_keys_for_eval,
1669 )
File ~\anaconda3\envs\pt\lib\site-packages\transformers\trainer.py:1909, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1906 rng_to_sync = True
1908 step = -1
-> 1909 for step, inputs in enumerate(epoch_iterator):
1910 total_batched_samples += 1
1911 if rng_to_sync:
File ~\anaconda3\envs\pt\lib\site-packages\torch\utils\data\dataloader.py:633, in _BaseDataLoaderIter.__next__(self)
630 if self._sampler_iter is None:
631 # TODO(https://github.com/pytorch/pytorch/issues/76750)
632 self._reset() # type: ignore[call-arg]
--> 633 data = self._next_data()
634 self._num_yielded += 1
635 if self._dataset_kind == _DatasetKind.Iterable and \
636 self._IterableDataset_len_called is not None and \
637 self._num_yielded > self._IterableDataset_len_called:
File ~\anaconda3\envs\pt\lib\site-packages\torch\utils\data\dataloader.py:677, in _SingleProcessDataLoaderIter._next_data(self)
675 def _next_data(self):
676 index = self._next_index() # may raise StopIteration
--> 677 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
678 if self._pin_memory:
679 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File ~\anaconda3\envs\pt\lib\site-packages\torch\utils\data\_utils\fetch.py:54, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
52 else:
53 data = self.dataset[possibly_batched_index]
---> 54 return self.collate_fn(data)
File ~\anaconda3\envs\pt\lib\site-packages\transformers\trainer_utils.py:704, in RemoveColumnsCollator.__call__(self, features)
702 def __call__(self, features: List[dict]):
703 features = [self._remove_columns(feature) for feature in features]
--> 704 return self.data_collator(features)
File ~\anaconda3\envs\pt\lib\site-packages\transformers\data\data_collator.py:249, in DataCollatorWithPadding.__call__(self, features)
248 def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
--> 249 batch = self.tokenizer.pad(
250 features,
251 padding=self.padding,
252 max_length=self.max_length,
253 pad_to_multiple_of=self.pad_to_multiple_of,
254 return_tensors=self.return_tensors,
255 )
256 if "label" in batch:
257 batch["labels"] = batch["label"]
File ~\anaconda3\envs\pt\lib\site-packages\transformers\tokenization_utils_base.py:2966, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
2962 # The model's main input name, usually `input_ids`, has be passed for padding
2963 if self.model_input_names[0] not in encoded_inputs:
2964 raise ValueError(
2965 "You should supply an encoding or a list of encodings to this method "
-> 2966 f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
2967 )
2969 required_input = encoded_inputs[self.model_input_names[0]]
2971 if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
AttributeError: 'list' object has no attribute 'keys'
My goal is to fine-tune the model on this dataset and then use it to make predictions.
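From the last frames of the traceback, my (possibly wrong) understanding is that DataCollatorWithPadding ends up receiving plain lists instead of per-example dicts. A minimal snippet illustrating what I think the tokenizer actually returns here:

# The tokenizer returns one dict-like BatchEncoding for the whole split,
# where "input_ids" is a single list of lists -- not a list of
# per-example dicts like {"input_ids": [...], "label": ...}.
enc = tokenizer(["first tweet", "second tweet"], truncation=True, padding=True)
print(type(enc))               # transformers BatchEncoding
print(type(enc["input_ids"]))  # list (of two lists of token ids)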