I have been having trouble with the bi-lastm-cfr model. I tried several fixes for different bugs but now i am stuck.
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch
import numpy as np
from torchcrf import CRF
class CRFBiLSTMModel(nn.Module):
def __init__(self, n_class, bert):
super(CRFBiLSTMModel, self).__init__()
# Load the pre-trained tokenizer and model
# self.tokenizer = AutoTokenizer.from_pretrained(model_name)
# self.model = AutoModel.from_pretrained(model_name)
self.bert = bert
self.n_class = n_class
self.dropout_rate = 0.2
self.lstm_hidden_size = self.bert.config.hidden_size
# Add a Bi-LSTM layer
self.lstm = nn.LSTM(
input_size=self.bert.config.hidden_size,
hidden_size=self.lstm_hidden_size,
num_layers=2,
batch_first=True,
# dropout=0.3,
bidirectional=True,
)
# self.lstm = nn.LSTM(self.lstm_hidden_size,
# self.lstm_hidden_size, bidirectional=True)
# Add a linear layer/classifier
# self.linear = nn.Linear(256, n_class)
self.linear = nn.Linear(self.lstm_hidden_size * 2, self.n_class, bias=True)
# self.dropout = nn.Dropout(p=self.dropout_rate)
# Add a CRF layer
self.crf = CRF(n_class)
def forward(self, batch):
# Get the model embeddings
b_input_ids = batch[0]
b_input_mask = batch[1]
outputs = self.bert(b_input_ids, attention_mask=b_input_mask)
embeddings = outputs.last_hidden_state
# Add a mask to the input sequence
mask = b_input_mask.unsqueeze(-1).repeat(1, 1, self.lstm_hidden_size * 2)
# Convert mask to binary mask
mask = mask.byte()
# print("debug", mask[0].shape)
# # Check if the first timestep has a zero mask value, and if so, set it to 1
# if not (mask[0].all()):
# mask.fill_(1)
# Pass the embeddings through the Bi-LSTM layer
lstm_outputs, _ = self.lstm(embeddings)
# Apply the mask to the LSTM outputs
masked_lstm_outputs = lstm_outputs.masked_fill(mask == 1, float('-inf'))
# Pass the masked LSTM outputs through the linear layer
logits = self.linear(masked_lstm_outputs)
logits = torch.nan_to_num(logits, nan=1.0)
# print("debug logits", logits)
# Apply the CRF layer
try:
predicted_labels = self.crf.decode(logits, b_input_mask)
print("debug predicted_labels 1", predicted_labels)
return (predicted_labels,)
except ValueError as error:
if str(error) == "mask of the first timestep must all be on":
print("hello")
b_input_mask[0].fill_(1)
predicted_labels = self.crf.decode(logits, b_input_mask.bool())
# predicted_labels = np.array([np.array(x) for x in predicted_labels])
print("debug predicted_labelsv2", predicted_labels)
self.labels = predicted_labels
return (predicted_labels,)
else:
print("debug: else crf_bilstm.py line 80")
exit()
class Train:
def format_time(self, elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
# Round to the nearest second.
elapsed_rounded = int(round((elapsed)))
# Format as hh:mm:ss
return str(datetime.timedelta(seconds=elapsed_rounded))
def fit(self, model, train_dataloader, validation_dataloader, epochs, device, optimizer, scheduler, criterion, writer, print_each=40):
# Set the seed value all ovr the place to make this reproducible.
seed_val = 2
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
model_save_path = 'tmp'
loss_values = []
hist_valid_scores = []
# For each epoch...
for epoch_i in range(0, epochs):
logs = {}
# ========================================
# Training
# ========================================
# Perform one full pass over the training set.
print("")
print(
'======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
print('Training...')
# Measure how long the training epoch takes.
t0 = time.time()
# Reset the total loss for this epoch.
total_loss = 0
total_accuracy = 0
# Put the model into training mode. Don't be mislead--the call to
# `train` just changes the *mode*, it doesn't *perform* the training.
model.train()
# For each batch of training data...
for step, batch in enumerate(train_dataloader):
# Progress update every 40 batches.
if step % print_each == 0 and not step == 0:
# Calculate elapsed time in minutes.
elapsed = self.format_time(time.time() - t0)
# Report progress.
print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(
step, len(train_dataloader), elapsed))
# move batch data to device (cpu or gpu)
batch = tuple(t.to(device) for t in batch)
# `batch` contains three pytorch tensors:
# [0]: input ids
# [1]: attention masks
# [2]: labels
model.zero_grad()
outputs = model(batch)
# The call to `model` always returns a tuple, so we need to pull the
# loss value out of the tuple.
logits = outputs[0]
label_ids = batch[-1]
# print(logits)
if isinstance(logits, list):
logits = torch.FloatTensor(logits[0])
label_ids = np.array([np.array(x) for x in label_ids])
label_ids = torch.FloatTensor(label_ids)#stoped here because criteroin expect float but this is long
print("kjgkjgjg", len(label_ids), logits.view(-1),
label_ids[-1])
loss = criterion(logits.view(-1),
label_ids.view(-1))
# logits = np.argmax(logits, axis=1)
# label_ids = np.argmax(label_ids, axis=1)
else:
loss = criterion(logits.view(-1, model.n_class),
label_ids.view(-1))
# Move logits back to cpu for metrics calculations
logits = logits.detach().cpu().numpy()
label_ids = label_ids.to('cpu').numpy()
# Calculate the accuracy for this batch of test sentences.
print(logits, label_ids)
current_accuracy = flat_accuracy(logits, label_ids)
total_accuracy += current_accuracy
# Accumulate the training loss over all of the batches so that we can
# calculate the average loss at the end. `loss` is a Tensor containing a
# single value; the `.item()` function just returns the Python value
# from the tensor.
total_loss += loss.item()
writer.add_scalar('training loss',
loss.item(),
epoch_i * len(train_dataloader)+step)
writer.add_scalar('training Accuracy',
current_accuracy,
epoch_i * len(train_dataloader)+step)
# Perform a backward pass to calculate the gradients.
loss.backward()
# Clip the norm of the gradients to 1.0.
# This is to help prevent the "exploding gradients" problem.
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Update parameters and take a step using the computed gradient.
# The optimizer dictates the "update rule"--how the parameters are
# modified based on their gradients, the learning rate, etc.
optimizer.step()
# Update the learning rate.
scheduler.step()
# Calculate the average loss over the training data.
avg_train_loss = total_loss / len(train_dataloader)
avg_train_accuracy = total_accuracy / len(train_dataloader)
# Store the loss value for plotting the learning curve.
loss_values.append(avg_train_loss)
logs["log loss"] = avg_train_loss
logs["accuracy"] = avg_train_accuracy
print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Training epcoh took: {:}".format(
self.format_time(time.time() - t0)))
# ========================================
# Validation
# ========================================
# After the completion of each training epoch, measure our performance on
# our validation set.
print("")
print("Running Validation...")
t0 = time.time()
# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()
# Tracking variables
eval_loss, eval_accuracy, eval_f1, eval_recall, eval_precesion = 0, 0, 0, 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
validation_loss = 0
# Evaluate data for one epoch
for step_valid, batch in enumerate(validation_dataloader):
# Add batch to GPU
batch = tuple(t.to(device) for t in batch)
# Unpack the inputs from our dataloader
# Telling the model not to compute or store gradients, saving memory and
# speeding up validation
with torch.no_grad():
# Forward pass, calculate logit predictions.
# This will return the logits rather than the loss because we have
# not provided labels.
# token_type_ids is the same as the "segment ids", which
# differentiates sentence 1 and 2 in 2-sentence tasks.
# The documentation for this `model` function is here:
model.eval()
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
outputs = model(batch)
# Get the "logits" output by the model. The "logits" are the output
# values prior to applying an activation function like the softmax.
logits = outputs[0]
label_ids = batch[-1]
validation_loss += criterion(logits.view(-1,
model.n_class), label_ids.view(-1))
# print(logits)
# Move logits and labels to CPU
logits = logits.detach().cpu().numpy()
label_ids = label_ids.to('cpu').numpy()
# Calculate the accuracy for this batch of test sentences.
tmp_eval_accuracy = flat_accuracy(logits, label_ids)
tmp_eval_f1 = flat_f1(logits, label_ids)
tmp_eval_recall = flat_recall(logits, label_ids)
tmp_eval_precision = flat_precision(logits, label_ids)
# Accumulate the total scores.
eval_accuracy += tmp_eval_accuracy
eval_f1 += tmp_eval_f1
eval_recall += tmp_eval_recall
eval_precesion += tmp_eval_precision
# Track the number of batches
nb_eval_steps += 1
validation_loss = validation_loss/len(validation_dataloader)
writer.add_scalar('validation Accuracy',
tmp_eval_accuracy,
epoch_i * len(validation_dataloader)+step_valid)
writer.add_scalar('validation F1',
tmp_eval_f1,
epoch_i * len(validation_dataloader)+step_valid)
writer.add_scalar('validation recall',
tmp_eval_recall,
epoch_i * len(validation_dataloader)+step_valid)
writer.add_scalar('validation precesion',
tmp_eval_precision,
epoch_i * len(validation_dataloader)+step_valid)
is_better = len(hist_valid_scores) == 0 or validation_loss < min(
hist_valid_scores)
hist_valid_scores.append(validation_loss)
if is_better:
patience = 0
print(
'save currently the best model to [%s]' % model_save_path, file=sys.stderr)
model.save(model_save_path)
# also save the optimizers' state
torch.save(optimizer.state_dict(), model_save_path + '.optim')
elif patience < 5:
patience += 1
'''print('hit patience %d' % patience, file=sys.stderr)
if patience == int(5):
# decay lr, and restore from previously best checkpoint
print('load previously best model and decay learning rate to ', file=sys.stderr)
# load model
params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
model.load_state_dict(params['state_dict'])
model = model.to(torch.device("cuda"))
print('restore parameters of the optimizers', file=sys.stderr)
optimizer.load_state_dict(torch.load(model_save_path + '.optim'))
# reset patience
patience = 0'''
# Report the final accuracy for this validation run.
print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print(" F1: {0:.2f}".format(eval_f1/nb_eval_steps))
print(" Recall: {0:.2f}".format(eval_recall/nb_eval_steps))
print(" Precision: {0:.2f}".format(eval_precesion/nb_eval_steps))
print(" Validation took: {:}".format(
self.format_time(time.time() - t0)))
return (eval_accuracy/nb_eval_steps, eval_f1/nb_eval_steps, eval_recall/nb_eval_steps, eval_precesion/nb_eval_steps,)
import sys
sys.path.append('../library')
import warnings
warnings.filterwarnings('ignore')
import gc
from crf_bilstm import CRFBiLSTMModel
from transformers import BertModel,BertTokenizer,FlaubertTokenizer, FlaubertModel,AutoTokenizer, BertForSequenceClassification , FlaubertForSequenceClassification
from transformers.modeling_utils import SequenceSummary
from torch import nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import re
from nlp.models import BasicBertForClassification
from nlp.training import Train
from nlp.preprocessing import TextPreprocessing
from nlp.feature_extraction import MetaFeaturesExtraction
from nlp.data_visualisation import WordCloudMaker
# needed
import torch
from tensorboardX import SummaryWriter
from transformers import AutoModel
import pandas as pd
from sklearn.model_selection import train_test_split
from nlp.feature_extraction import BertInput
from transformers import AdamW,get_linear_schedule_with_warmup
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from focalLoss import FocalLoss2
from datetime import datetime
##################################### Functions ###########################################################
def getLoss(name, normedWeights, dic_cat_labels):
if name == "default":
return None
elif name == "crossentropy":
return nn.CrossEntropyLoss()
elif name == "crossentropyweighted":
return nn.CrossEntropyLoss(weight=normedWeights)
elif name == "focalloss":
return FocalLoss2(gamma=5.,alpha=0.25, num_class=len(dic_cat_labels))
def get_sentences_labels(df,text_column='text_clean',label_column='CAT',cat_labels=None):
dic_cat_labels = cat_labels if cat_labels is not None else {x:value for x,value in enumerate(df[label_column].unique())}
dic_labels_to_cat = {value:x for x,value in dic_cat_labels.items() }
#df[text_column]= df[text_column].map(lambda text_clean : re.sub('["#$%&()*+,-./:;<=>@[\]^_`{|}~\n\tâ\']', '', text_clean))
df2 = df[label_column].map(dic_labels_to_cat)
sentences = df[text_column].values
labels = df2.values.astype(int)
return sentences,labels,dic_cat_labels
def get_label_callback(dataset,idx):
return dataset[idx][3].item()
#################################### Inline code ##########################################################
def main (file, separator, col_text, model_name, test, col_label, loss, finetune, no_other=False):
writer = SummaryWriter('runs/test')
print("Begin: Current Time =", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))
gc.collect()
print("finetune:", finetune, "start test with categories = ", col_label, " using loss = ", loss, " and sampling = ", test)
df = pd.read_csv(file,sep=separator, quotechar='"', dtype='str')
# df = pd.read_csv(file,sep=separator,lineterminator="\n")
print(f"Import de {file} : \nNombres d'instance : {len(df)} \n")
print(df[col_label].value_counts(), " \n")
df.drop(df[df[col_label] =="NotAnnotated"].index, inplace=True)
df.drop(df[df[col_label] =="<TOCOMPLETE>"].index, inplace=True)
if col_label=="SA2" and no_other:
df.drop(df[df[col_label] =="Autre"].index, inplace=True)
print(f"AFTER DROP: Nombres d'instance : {len(df)} \n")
# print(df.head(2))
df = df[0:50]
text_preprocessing = TextPreprocessing(df,col_text)
text_preprocessing.fit_transform()
df_train , df_test = train_test_split(df,random_state=1, test_size=0.2)
sentences_train,labels_train,dic_cat_labels=get_sentences_labels(df_train,text_column='processed_text',label_column=col_label)
n_class = len(dic_cat_labels)
sentences_test,labels_test,dic_cat_labels=get_sentences_labels(df_test,text_column='processed_text',label_column=col_label,cat_labels=dic_cat_labels)
n_class = n_class if n_class > len(dic_cat_labels) else len(dic_cat_labels)
print("Classes : " )
print(dic_cat_labels)
bert_input= BertInput(AutoTokenizer.from_pretrained(model_name))
X_train = bert_input.fit_transform(sentences_train)
X_test = bert_input.fit_transform(sentences_test)
print(dic_cat_labels)
# Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels,train_masks,validation_masks = train_test_split(X_train[0], labels_train,X_train[1],random_state=1, test_size=0.2)
# Do the same for the masks
test_inputs = X_test[0]
test_masks = X_test[1]
print(labels_test)
test_labels = np.argmax(np.array(labels_test), axis=0) #now I tried this following a stackoverflow suggestion
# Convert all inputs and labels into torch tensors, the required datatype
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
test_inputs = torch.tensor(test_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
test_labels = torch.tensor(test_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
test_masks = torch.tensor(test_masks)
print("len(train_labels)=", len(train_labels))
batch_size = 1 #here it should be 16 but it is not working
# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs,train_masks,train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size,drop_last=True )
# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs,validation_masks ,validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
# Create the DataLoader for our test set.
test_data = TensorDataset(test_inputs,test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
print("len(test_data[0])=", len(test_data[0]))
base_model = AutoModel.from_pretrained(model_name)
model = CRFBiLSTMModel(bert=base_model,n_class=n_class)
model.cpu()
# finetune the embedding while training
model.bert.embeddings.requires_grad = fine
optimizer = AdamW(model.parameters(),
lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
eps = 1e-8 # args.adam_epsilon - default is 1e-8.
)
epochs = 4
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps = 0, # Default value in run_glue.py
num_training_steps = total_steps)
df_value = pd.DataFrame(train_labels).value_counts(sort=False)
normedWeights = [1 - (x / sum(df_value)) for x in df_value]
normedWeights = torch.FloatTensor(normedWeights).to('cpu')
loss_function = getLoss(loss, normedWeights, dic_cat_labels)
train = Train()
train.fit(model,train_dataloader,validation_dataloader,epochs,torch.device('cpu'),optimizer,scheduler,loss_function, writer)
# Train the model on your training data
for epoch in range(epochs):
for batch in train_dataloader:
# Clear gradients
optimizer.zero_grad()
# Forward pass
predicted_labels = model(batch)
print(len(predicted_labels))
predicted_labels = torch.tensor(predicted_labels)
true_labels = torch.tensor(batch[-1].to('cpu').numpy())
print("debug3", (predicted_labels), (true_labels))
loss = loss_function(predicted_labels, true_labels.view(len(true_labels), 1))
# Backward pass
loss.backward()
optimizer.step()
# Evaluate the trained model on your test data
with torch.no_grad():
all_predicted_labels = []
all_true_labels = []
for batch in validation_data:
predicted_labels = model(batch)
true_labels = batch.labels
all_predicted_labels.extend(predicted_labels)
all_true_labels.extend(true_labels)
print("end validation")
# Generate a classification report to evaluate the performance of your model
target_names = ["Negative", "Positive"]
print(classification_report(all_true_labels, all_predicted_labels, target_names=target_names))
model_names = ['camembert-base']
tests = ["Random Sampling"]
categories=["CAT"]
losses = ["crossentropy", "crossentropyweighted", "focalloss"]
finetune = [False, True]
for model_name in model_names:
print(model_name)
for test in tests:
for col_label in categories:
for loss in losses:
for fine in finetune:
main("../data/corpus1.csv", "\t", "text", model_name, test=test, loss = loss, col_label=col_label, finetune=fine, no_other=True)
My corpus is a normal csv I read it as a dataframe. It contains one column for the text and one column for the label. I have seven categories.
Can you help me with this?
Right now my model is :
BiLSTM -> Linear Layer (Hidden to tag) -> CRf Layer