Using pythorch and transformers library I am trying to user bert-base-cased for a regression task.
This is how I implement the dataset
class CustomDataset(Dataset):
def __init__(self, data, maxlen, tokenizer, target_cols):
self.df = data.reset_index()
self.tokenizer = tokenizer
self.maxlen = maxlen
self.target_cols = target_cols
def __len__(self):
return self.df.shape[0]
def __getitem__(self, index):
full_text = self.df.loc[index, "text"]
# Preprocess the text to be suitable for the transformer
tokens = self.tokenizer.tokenize(full_text)
tokens = ['[CLS]'] + tokens + ['[SEP]']
if len(tokens) < self.maxlen:
tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))]
else:
tokens = tokens[:self.maxlen - 1] + ['[SEP]']
# Obtain the indices of the tokens in the BERT Vocabulary
input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(input_ids)
# Obtain the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
attention_mask = (input_ids != 0).long()
item = {}
try:
target = self.df.loc[index, self.target_cols]
except Exception as e:
raise e
target = torch.tensor(target, dtype=torch.float32)
item["input_ids"] = input_ids
item["attention_mask"] = attention_mask
item["target"] = target
return item
While these are the model and the custom trainer with a custom loss
class RegressorModel(nn.Module):
def __init__(self, config):
super(RegressorModel, self).__init__()
self.model_name = config['model']
self.freeze = config['freeze_encoder']
self.encoder = AutoModel.from_pretrained(self.model_name)
if self.freeze:
for param in self.encoder.base_model.parameters():
param.requires_grad = False
self.cls_layer1 = nn.Linear(self.encoder.config.hidden_size, 128)
self.relu1 = nn.ReLU()
self.ff1 = nn.Linear(128, 2)
def forward(self, input_ids, attention_mask):
# Feed the input to Bert model to obtain contextualized representations
outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
# Obtain the representations of [CLS] heads
logits = outputs.last_hidden_state[:, 0, :]
output = self.cls_layer1(logits)
output = self.relu1(output)
output = self.ff1(output)
return output
class CustomTrainer(Trainer):
def __int__(self, *args, **kwargs):
super().__int__(*args, **kwargs)
def compute_loss(self, model, inputs, return_outputs=False):
targets = inputs["target"]
outputs = model(**inputs)
colwise_mse = torch.mean(torch.square(targets - outputs), dim=0)
loss = torch.mean(torch.sqrt(colwise_mse), dim=0)
return (loss, outputs) if return_outputs else loss
finally I start the training with
config = {
'model': 'bert-base-cased',
'max_length': 512,
'freeze_encoder': True
}
tokenizer = AutoTokenizer.from_pretrained(config['model'])
target_cols = ["content", "wording"]
train_set = CustomDataset(data=train, maxlen=config['max_length'], tokenizer=tokenizer, target_cols=target_cols)
valid_set = CustomDataset(data=val, maxlen=config['max_length'], tokenizer=tokenizer, target_cols=target_cols)
test_set = CustomDataset(data=test, maxlen=config['max_length'], tokenizer=tokenizer, target_cols=target_cols)
print("Train set", next(iter(train_set))) #here I correctly see that I get a dictionary with 3 keys, including the target
model = RegressorModel(config).to("cuda")
default_args = {
"output_dir": "tmp",
"evaluation_strategy": "steps",
"num_train_epochs": 5,
"log_level": "error",
"report_to": "none",
"full_determinism": False
}
training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
trainer = CustomTrainer(model=model, args=training_args,
train_dataset=train_set, eval_dataset=valid_set)
result = trainer.train()
The problem that I get is compute_loss when I am trying to get the targets since inputs contains only inputs_ids and attention mask
Traceback (most recent call last):
File "/storagenfs/us/Challenge/AccelerateBERT.py", line 66, in <module>
result = trainer.train()
File "/storagenfs/us/.virtualenvs/Challenge/lib/python3.8/site-packages/transformers/trainer.py", line 1539, in train
return inner_training_loop(
File "/storagenfs/us/.virtualenvs/Challenge/lib/python3.8/site-packages/transformers/trainer.py", line 1809, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/storagenfs/us/.virtualenvs/Challenge/lib/python3.8/site-packages/transformers/trainer.py", line 2654, in training_step
loss = self.compute_loss(model, inputs)
File "/storagenfs/us/Challenge/utils/utils.py", line 116, in compute_loss
targets = inputs["target"]
KeyError: 'target'
So how to use correctly a custom trainer to pass the targets too?