Using PyTorch and the transformers library, I am trying to use bert-base-cased for a regression task.

This is how I implement the dataset:

import torch
from torch import nn
from torch.utils.data import Dataset
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments

class CustomDataset(Dataset):
    def __init__(self, data, maxlen, tokenizer, target_cols):
        self.df = data.reset_index()
        self.tokenizer = tokenizer
        self.maxlen = maxlen

        self.target_cols = target_cols

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        full_text = self.df.loc[index, "text"]
     
        # Preprocess the text to be suitable for the transformer
        tokens = self.tokenizer.tokenize(full_text)
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))]
        else:
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']

        # Obtain the indices of the tokens in the BERT Vocabulary
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_ids = torch.tensor(input_ids)
        # Obtain the attention mask, i.e. a tensor with 1s for real tokens and 0s for padded ones ([PAD] is id 0 in BERT's vocabulary)
        attention_mask = (input_ids != 0).long()

        target = torch.tensor(self.df.loc[index, self.target_cols], dtype=torch.float32)

        item = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "target": target,
        }

        return item
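
As a side note, I believe the manual [CLS]/[SEP]/[PAD] handling above could equivalently be delegated to the tokenizer itself; this is just a sketch of that alternative (using the same tokenizer and maxlen), not what I currently run:

encoding = tokenizer(
    full_text,
    padding="max_length",   # pad with [PAD] up to max_length
    truncation=True,        # truncate while keeping the final [SEP]
    max_length=maxlen,
    return_tensors="pt",
)
input_ids = encoding["input_ids"].squeeze(0)            # shape (maxlen,)
attention_mask = encoding["attention_mask"].squeeze(0)  # shape (maxlen,)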

These are the model and the custom Trainer with a custom loss:

class RegressorModel(nn.Module):
    def __init__(self, config):
        super(RegressorModel, self).__init__()
        self.model_name = config['model']

        self.freeze = config['freeze_encoder']

        self.encoder = AutoModel.from_pretrained(self.model_name)
        if self.freeze:
            for param in self.encoder.base_model.parameters():
                param.requires_grad = False

        self.cls_layer1 = nn.Linear(self.encoder.config.hidden_size, 128)
        self.relu1 = nn.ReLU()
        self.ff1 = nn.Linear(128, 2)

    def forward(self, input_ids, attention_mask):
        # Feed the input to Bert model to obtain contextualized representations
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Obtain the representation of the [CLS] token (first position)
        cls_rep = outputs.last_hidden_state[:, 0, :]
        output = self.cls_layer1(cls_rep)
        output = self.relu1(output)
        output = self.ff1(output)
        return output

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False):
        targets = inputs["target"]
        outputs = model(**inputs)
        # MCRMSE: the RMSE of each target column, averaged over the columns
        colwise_mse = torch.mean(torch.square(targets - outputs), dim=0)
        loss = torch.mean(torch.sqrt(colwise_mse), dim=0)
        return (loss, outputs) if return_outputs else loss
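
The loss is the mean column-wise RMSE (often called MCRMSE). A quick standalone check of the formula with made-up numbers:

import torch

targets = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
preds = torch.tensor([[1.5, 2.0], [2.5, 4.0]])
colwise_mse = torch.mean(torch.square(targets - preds), dim=0)  # tensor([0.2500, 0.0000])
loss = torch.mean(torch.sqrt(colwise_mse))                      # (0.5 + 0.0) / 2 = 0.25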

Finally, I start the training with:

config = {
    'model': 'bert-base-cased',
    'max_length': 512,
    'freeze_encoder': True
}


tokenizer = AutoTokenizer.from_pretrained(config['model'])
target_cols = ["content", "wording"]
train_set = CustomDataset(data=train, maxlen=config['max_length'], tokenizer=tokenizer, target_cols=target_cols)
valid_set = CustomDataset(data=val, maxlen=config['max_length'], tokenizer=tokenizer, target_cols=target_cols)
test_set = CustomDataset(data=test, maxlen=config['max_length'], tokenizer=tokenizer, target_cols=target_cols)

print("Train set", next(iter(train_set))) #here I correctly see that I get a dictionary with 3 keys, including the target

model = RegressorModel(config).to("cuda")
default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "steps",
    "num_train_epochs": 5,
    "log_level": "error",
    "report_to": "none",
    "full_determinism": False
}
training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
trainer = CustomTrainer(model=model, args=training_args,
                        train_dataset=train_set, eval_dataset=valid_set)
result = trainer.train()
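
To make sure the model itself is fine, I can run a manual forward pass outside the Trainer (adding the batch dimension by hand):

item = next(iter(train_set))
with torch.no_grad():
    preds = model(
        input_ids=item["input_ids"].unsqueeze(0).to("cuda"),
        attention_mask=item["attention_mask"].unsqueeze(0).to("cuda"),
    )
print(preds.shape)  # torch.Size([1, 2]), one value per target column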

The problem is in compute_loss: when I try to read the targets, inputs contains only input_ids and attention_mask:

Traceback (most recent call last):
  File "/storagenfs/us/Challenge/AccelerateBERT.py", line 66, in <module>
    result = trainer.train()
  File "/storagenfs/us/.virtualenvs/Challenge/lib/python3.8/site-packages/transformers/trainer.py", line 1539, in train
    return inner_training_loop(
  File "/storagenfs/us/.virtualenvs/Challenge/lib/python3.8/site-packages/transformers/trainer.py", line 1809, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/storagenfs/us/.virtualenvs/Challenge/lib/python3.8/site-packages/transformers/trainer.py", line 2654, in training_step
    loss = self.compute_loss(model, inputs)
  File "/storagenfs/us/Challenge/utils/utils.py", line 116, in compute_loss
    targets = inputs["target"]
KeyError: 'target'
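
To narrow it down, one check I can run is to call the collator directly and see which keys survive (if I understand correctly, the Trainer falls back to transformers' default_data_collator when neither a tokenizer nor a data_collator is passed in):

from torch.utils.data import DataLoader
from transformers import default_data_collator

loader = DataLoader(train_set, batch_size=4, collate_fn=default_data_collator)
batch = next(iter(loader))
print(batch.keys())  # does "target" survive collation?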

So how do I correctly use a custom Trainer so that the targets are passed through as well?
