I am a beginner at this, so please help me find a solution. I use RobertaTokenizerFast to tokenize the text and the summary (max token lengths 200 and 50, respectively). The plan is to use RoBERTa as the first layer and then condense its output to the target summary length with Conv2d, max_pool2d, and a dense layer. Since the last dense layer outputs a float vector, I normalize the target vector of (long) input_ids into float values between 0 and 1. Finally, I compute the loss with CrossEntropyLoss.
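Roughly, the tokenization looks like this (a simplified sketch; the variables text and summary and the padding/truncation settings stand in for my actual dataset code):

    from transformers import RobertaTokenizerFast

    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

    # source text: up to 200 tokens
    text_encoding = tokenizer(
        text,
        max_length=200,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    # target summary: up to 50 tokens
    summary_encoding = tokenizer(
        summary,
        max_length=50,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    # each batch then contains:
    #   text_input_ids, text_attention_mask   -> shape [BATCH_SIZE, 200]
    #   labels (= summary input_ids), labels_attention_mask -> shape [BATCH_SIZE, 50]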
    import torch
    import torch.nn.functional as F
    import pytorch_lightning as pl
    from torch.optim import AdamW
    from transformers import RobertaModel

    BATCH_SIZE = 20

    class Summarizer(pl.LightningModule):
        def __init__(self):
            super().__init__()
            self.roberta = RobertaModel.from_pretrained(
                'roberta-base', return_dict=True, is_decoder=True, use_cache=False)
            # the conv layer treats the batch dimension as its input channels
            self.convlayer = torch.nn.Conv2d(in_channels=BATCH_SIZE, out_channels=1, kernel_size=4)
            self.relu = torch.nn.ReLU()
            # 97 * 381 is the flattened size after the conv + max-pool below
            self.fc = torch.nn.Linear(in_features=97 * 381, out_features=50)
            self.cross_entropy_loss = torch.nn.CrossEntropyLoss()

        def forward(self, input_ids, attention_mask, labels):
            output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
            x = output['last_hidden_state']
            # add a leading dim so the batch dimension becomes the conv channels
            x = torch.unsqueeze(x, 0)
            x = self.convlayer(x)
            x = self.relu(x)
            x = F.max_pool2d(x, kernel_size=4, stride=2)
            # flatten and project down to the summary length (50)
            x = x.squeeze().flatten()
            x = self.fc(x)
            output = self.relu(x)
            crossent_loss = self.cross_entropy_loss(labels, output)
            return crossent_loss, output

        def training_step(self, batch, batch_idx):
            input_ids = batch['text_input_ids']
            attention_mask = batch['text_attention_mask']
            l = batch['labels'].float()
            l = torch.tensor(l / torch.linalg.norm(l))   # normalized labels in (0, 1)
            labels = l
            labels_attention_mask = batch['labels_attention_mask']
            loss, outputs = self(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            self.log('train_loss', loss, prog_bar=True, logger=True)
            return loss

        def validation_step(self, batch, batch_idx):
            input_ids = batch['text_input_ids']
            attention_mask = batch['text_attention_mask']
            l = batch['labels'].float()
            l = torch.tensor(l / torch.linalg.norm(l))
            labels = l
            labels_attention_mask = batch['labels_attention_mask']
            loss, outputs = self(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            self.log('val_loss', loss, prog_bar=True, logger=True)
            return loss

        def test_step(self, batch, batch_idx):
            input_ids = batch['text_input_ids']
            attention_mask = batch['text_attention_mask']
            l = batch['labels'].float()
            l = torch.tensor(l / torch.linalg.norm(l))
            labels = l
            labels_attention_mask = batch['labels_attention_mask']
            loss, outputs = self(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            self.log('test_loss', loss, prog_bar=True, logger=True)
            return loss

        def configure_optimizers(self):
            return AdamW(self.parameters(), lr=0.0001)
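To show where the 97 * 381 in the dense layer comes from, here is how I trace the shapes through forward with a dummy batch (a sketch; 768 is roberta-base's hidden size and 200 is the text max length from the tokenizer):

    import torch
    import torch.nn.functional as F

    model = Summarizer()
    input_ids = torch.randint(0, model.roberta.config.vocab_size, (BATCH_SIZE, 200))
    attention_mask = torch.ones(BATCH_SIZE, 200, dtype=torch.long)

    with torch.no_grad():
        x = model.roberta(input_ids=input_ids, attention_mask=attention_mask)['last_hidden_state']
        print(x.shape)                                  # torch.Size([20, 200, 768])
        x = model.relu(model.convlayer(torch.unsqueeze(x, 0)))
        print(x.shape)                                  # torch.Size([1, 1, 197, 765])
        x = F.max_pool2d(x, kernel_size=4, stride=2)
        print(x.shape)                                  # torch.Size([1, 1, 97, 381])
        x = model.fc(x.squeeze().flatten())
        print(x.shape)                                  # torch.Size([50])
    # ...while batch['labels'] (the normalized summary ids) has shape [20, 50]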
Training with pl.Trainer then fails with

ValueError: Expected input batch_size (20) to match target batch_size (50).

I cannot figure out where this batch-size mismatch comes from. What am I doing wrong?
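My real DataModule feeds in the tokenized batches described above; this stripped-down version with random token ids (the dataset and loader names here are placeholders, not my actual code) reproduces the error:

    from torch.utils.data import DataLoader, Dataset

    class DummySummaryDataset(Dataset):
        # placeholder dataset with random ids, shaped like my real batches
        def __len__(self):
            return 100
        def __getitem__(self, idx):
            return {
                'text_input_ids': torch.randint(0, 50265, (200,)),
                'text_attention_mask': torch.ones(200, dtype=torch.long),
                'labels': torch.randint(0, 50265, (50,)),
                'labels_attention_mask': torch.ones(50, dtype=torch.long),
            }

    train_loader = DataLoader(DummySummaryDataset(), batch_size=BATCH_SIZE)
    val_loader = DataLoader(DummySummaryDataset(), batch_size=BATCH_SIZE)

    model = Summarizer()
    trainer = pl.Trainer(max_epochs=1)
    trainer.fit(model, train_loader, val_loader)   # raises the ValueError above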