I'm working on multiclass classification (Bengali-language sentiment analysis) with a pretrained Hugging Face model (BertForMaskedLM).
When the error occurred I understood that I have to change the label (output) size to match the input, but I don't know how. I'm adding the code snippets below.
MAX_LEN = 200
BATCH_SIZE = 16
The pretrained model and tokenizer used:
from transformers import BertForMaskedLM, BertTokenizer, pipeline
model = BertForMaskedLM.from_pretrained("sagorsarker/bangla-bert-base")
tokenizer = BertTokenizer.from_pretrained("sagorsarker/bangla-bert-base")
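As a quick sanity check that the checkpoint loads (not part of the training code, just a sketch of the kind of thing the pipeline import is for), it can be exercised with a fill-mask pipeline:
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
print(fill_mask(f"আমি বাংলায় {tokenizer.mask_token} গাই।"))  # predicts the masked Bengali token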
Code to build the PyTorch dataset:
import torch
from torch.utils.data import Dataset

class GPReviewDataset(Dataset):
    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        target = self.targets[item]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }
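For context, the dataset is wrapped in a DataLoader roughly like this (the dataframe and its review/sentiment column names here are placeholders, not necessarily my exact ones):
from torch.utils.data import DataLoader

def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = GPReviewDataset(
        reviews=df['review'].to_numpy(),     # raw Bengali review texts
        targets=df['sentiment'].to_numpy(),  # integer class labels
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, shuffle=True)

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
valid_data_loader = create_data_loader(df_valid, tokenizer, MAX_LEN, BATCH_SIZE)
data = next(iter(train_data_loader))  # one batch, used for the shape check below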
The input dimensions are:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)
Which outputs:
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
The training function:
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    model = model.train()  # puts the model in training mode
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
        loss, logits = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=targets
        )
        # logits = classification scores before softmax
        # loss = classification loss
        logits = logits.view(-1, 28*28).detach().cpu().numpy()
        label_ids = targets.to('cpu').numpy()
        preds = np.argmax(logits, axis=1).flatten()  # returns indices of the maximum logit
        targ = label_ids.flatten()
        correct_predictions += np.sum(preds == targ)
        losses.append(loss.item())
        loss.backward()  # backpropagation: computes derivatives of the loss w.r.t. the parameters
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # clip gradients so they don't explode
        optimizer.step()  # uses the gradients computed by loss.backward() to update the parameters
        scheduler.step()  # updates the learning rate; without this it stays at its initial value
        optimizer.zero_grad()  # clears old gradients from the last step
    return correct_predictions / n_examples, np.mean(losses)
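eval_model (called in the loop below) mirrors train_epoch without the backward pass; a minimal sketch of it, assuming the model returns (loss, logits) with one logit per class:
def eval_model(model, data_loader, device, n_examples):
    model = model.eval()  # evaluation mode: disables dropout
    losses = []
    correct_predictions = 0
    with torch.no_grad():  # no gradient tracking during validation
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            loss, logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets
            )
            preds = np.argmax(logits.detach().cpu().numpy(), axis=1).flatten()
            correct_predictions += np.sum(preds == targets.cpu().numpy().flatten())
            losses.append(loss.item())
    return correct_predictions / n_examples, np.mean(losses)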
Where the training starts (and where the error is triggered):
%%time
# standard block
# used accuracy as metric here
history = defaultdict(list)

best_acc = 0

for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler, len(df_train))

    print(f'Train loss {train_loss} Accuracy {train_acc}')

    val_acc, val_loss = eval_model(model, valid_data_loader, device, len(df_valid))

    print(f'Val loss {val_loss} Accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # We are storing state of best model indicated by highest validation accuracy
    if val_acc > best_acc:
        torch.save(model.state_dict(), 'best_model_state_a5.bin')
        best_acc = val_acc
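EPOCHS, the optimizer, and the scheduler are defined in an earlier cell, roughly like this (a sketch of the setup; my exact hyperparameters may differ):
from collections import defaultdict
import numpy as np
import torch
import torch.nn as nn
from transformers import AdamW, get_linear_schedule_with_warmup

EPOCHS = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)  # AdamW from transformers
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,             # no warmup steps
    num_training_steps=total_steps  # decay the learning rate linearly over all training steps
)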
The error:
Epoch 1/5
----------
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-41-fb5a4d77ce37> in <module>()
----> 1 get_ipython().run_cell_magic('time', '', "# standard block\n# used accuracy as metric here\nhistory = defaultdict(list)\n\nbest_acc = 0\n\nfor epoch in range(EPOCHS):\n\n print(f'Epoch {epoch + 1}/{EPOCHS}')\n print('-' * 10)\n\n train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler, len(df_train))\n\n print(f'Train loss {train_loss} Accuracy {train_acc}')\n\n val_acc, val_loss = eval_model(model, valid_data_loader, device, len(df_valid))\n\n print(f'Val loss {val_loss} Accuracy {val_acc}')\n print()\n\n history['train_acc'].append(train_acc)\n history['train_loss'].append(train_loss)\n history['val_acc'].append(val_acc)\n history['val_loss'].append(val_loss)\n\n if val_acc > best_acc:\n torch.save(model.state_dict(), 'best_model_state_a5.bin')\n best_acc = val_acc\n\n# We are storing state of best model indicated by highest validation accuracy")
8 frames
/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2115 magic_arg_s = self.var_expand(line, stack_depth)
2116 with self.builtin_trap:
-> 2117 result = fn(magic_arg_s, cell)
2118 return result
2119
<decorator-gen-53> in time(self, line, cell, local_ns)
/usr/local/lib/python3.7/dist-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
186 # but it's overkill for just that one bit of state.
187 def magic_deco(arg):
--> 188 call = lambda f, *a, **k: f(*a, **k)
189
190 if callable(arg):
/usr/local/lib/python3.7/dist-packages/IPython/core/magics/execution.py in time(self, line, cell, local_ns)
1191 else:
1192 st = clock2()
-> 1193 exec(code, glob, local_ns)
1194 end = clock2()
1195 out = None
<timed exec> in <module>()
<ipython-input-39-948eefef2f8d> in train_epoch(model, data_loader, optimizer, device, scheduler, n_examples)
13 input_ids=input_ids,
14 attention_mask=attention_mask,
---> 15 labels = targets
16 )
17
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, output_attentions, output_hidden_states, return_dict)
1327 if labels is not None:
1328 loss_fct = CrossEntropyLoss() # -100 index = padding token
-> 1329 masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1330
1331 if not return_dict:
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/loss.py in forward(self, input, target)
1119 def forward(self, input: Tensor, target: Tensor) -> Tensor:
1120 return F.cross_entropy(input, target, weight=self.weight,
-> 1121 ignore_index=self.ignore_index, reduction=self.reduction)
1122
1123
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
2822 if size_average is not None or reduce is not None:
2823 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2824 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
2825
2826
ValueError: Expected input batch_size (3200) to match target batch_size (16).