I'm working on an NLP classification problem where I'm trying to classify training courses into 99 categories. I built a few models, including a Bayesian classifier, but it only reached 55% accuracy, which is quite poor.
Given those results, I tried to fine-tune the CamemBERT model (my data is in French) to improve the results. I had never used these methods before, so I followed this example and adapted it to my code.
In the example above there are 2 labels, while I have 99. I left certain parts unchanged:
epochs = 5
MAX_LEN = 128
batch_size = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)
I kept the same variable names: text holds the feature column and labels holds the labels.
text = training['Intitulé (Ce champ doit respecter la nomenclature suivante : Code action – Libellé)_x']
labels = training['Domaine sou domaine ']
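For reference, a quick sanity check on the label column (assuming training is a pandas DataFrame) looks like this:
# Check how many distinct categories there are and how they are distributed
print(labels.nunique())                 # expected: 99
print(labels.value_counts().head(10))   # most frequent categories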
I tokenized and padded the sequences with the same values as in the example, because I didn't know which values were right for my data (see the length check sketched after the snippet below).
# Use the tokenizer to convert the sentences into token IDs
input_ids = [tokenizer.encode(sent, add_special_tokens=True, max_length=MAX_LEN) for sent in text]
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)
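To check whether MAX_LEN = 128 actually fits my data, a rough length check like the following could be run first (my own sketch, not part of the example):
# Look at the tokenized lengths before any padding/truncation
lengths = [len(tokenizer.encode(sent, add_special_tokens=True)) for sent in text]
print("max:", max(lengths), "mean:", sum(lengths) / len(lengths))
print(sum(1 for l in lengths if l > MAX_LEN), "sequences longer than MAX_LEN")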
I noticed that the labels are numeric in the example above, so I converted my labels to numeric values with this code:
# Map each distinct label to an integer id, then replace the labels with those ids
label_map = {label: i for i, label in enumerate(set(labels))}
numeric_labels = [label_map[label] for label in labels]
labels = numeric_labels
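One detail I'm unsure about: set() has no guaranteed order, so label_map can change between runs. A deterministic variant (my own tweak, not from the example) would replace the first line with:
# Sort the distinct labels first so the label -> id mapping is reproducible across runs
label_map = {label: i for i, label in enumerate(sorted(set(training['Domaine sou domaine '])))}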
I then started building the model, beginning with the tensors:
# Use train_test_split to split our data into train and validation sets for training
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, random_state=42, test_size=0.1
)
train_masks, validation_masks = train_test_split(
    attention_masks, random_state=42, test_size=0.1
)
# Convert the data to torch tensors
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
# Create data loaders
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
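As a sanity check (not in the example), I can peek at one batch to confirm the shapes:
# Expect (batch_size, MAX_LEN) for the ids and masks, (batch_size,) for the labels
b_ids, b_mask, b_lab = next(iter(train_dataloader))
print(b_ids.shape, b_mask.shape, b_lab.shape)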
# Define the model architecture
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=99)
# Move the model to the appropriate device
model.to(device)
The output is:
CamembertForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): RobertaIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): RobertaOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
  )
  (classifier): RobertaClassificationHead(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (out_proj): Linear(in_features=768, out_features=99, bias=True)
  )
)
Then I set up the optimizer, an accuracy helper, and the training loop:
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer], 'weight_decay_rate': 0.01}]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=10e-8)
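For completeness: I did not use a learning-rate scheduler or gradient clipping. If I understand the docs correctly, a linear warmup schedule could be added roughly like this (a sketch, not what I actually ran):
from transformers import get_linear_schedule_with_warmup

total_steps = len(train_dataloader) * epochs
# Linear warmup followed by linear decay of the learning rate
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
# scheduler.step() would then be called right after optimizer.step() in the training loop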
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
train_loss_set = []
# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
    # Tracking variables for training
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    # Train the model
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Add batch to device CPU or GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        # Get loss value
        loss = outputs[0]
        # Add it to train loss list
        train_loss_set.append(loss.item())
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
    print("Train loss: {}".format(tr_loss / nb_tr_steps))

    # Tracking variables for validation
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Validation of the model
    model.eval()
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to device CPU or GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss, logits = outputs[:2]
        # Move logits and labels to CPU if GPU is used
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
The code ran, but the validation accuracy was around 30%, which is far worse than the Bayesian classifier even though that one uses a much simpler algorithm and straightforward calculations. This made me realize that I must have fine-tuned the model incorrectly, but I don't understand fine-tuning well enough to know where I went wrong.
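If it helps to diagnose this, I could also compute a per-class breakdown on the validation set, along these lines (a rough sketch using scikit-learn's classification_report; I have not run it yet):
from sklearn.metrics import classification_report

model.eval()
all_preds, all_labels = [], []
for batch in validation_dataloader:
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
    with torch.no_grad():
        logits = model(b_input_ids, attention_mask=b_input_mask).logits
    all_preds.extend(logits.argmax(dim=1).cpu().tolist())
    all_labels.extend(b_labels.cpu().tolist())
print(classification_report(all_labels, all_preds, zero_division=0))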