BiLSTM-CRF for text classification in PYTORCH

Question

I have been having trouble with the bi-lastm-cfr model. I tried several fixes for different bugs but now i am stuck.

from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch
import numpy as np
from torchcrf import CRF

class CRFBiLSTMModel(nn.Module):
    def __init__(self, n_class, bert):
        super(CRFBiLSTMModel, self).__init__()

        # Load the pre-trained tokenizer and model
        # self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # self.model = AutoModel.from_pretrained(model_name)
        self.bert = bert
        self.n_class = n_class
        self.dropout_rate = 0.2
        self.lstm_hidden_size = self.bert.config.hidden_size
        # Add a Bi-LSTM layer
        self.lstm = nn.LSTM(
            input_size=self.bert.config.hidden_size,
            hidden_size=self.lstm_hidden_size,
            num_layers=2,
            batch_first=True,
            # dropout=0.3,
            bidirectional=True,
        )
        # self.lstm = nn.LSTM(self.lstm_hidden_size,
        #                     self.lstm_hidden_size, bidirectional=True)

        # Add a linear layer/classifier
        # self.linear = nn.Linear(256, n_class)
        self.linear = nn.Linear(self.lstm_hidden_size * 2, self.n_class, bias=True)
        # self.dropout = nn.Dropout(p=self.dropout_rate)

        # Add a CRF layer
        self.crf = CRF(n_class)

    def forward(self, batch):

        # Get the model embeddings
        b_input_ids = batch[0]
        b_input_mask = batch[1]

        outputs = self.bert(b_input_ids, attention_mask=b_input_mask)
        embeddings = outputs.last_hidden_state

        # Add a mask to the input sequence
        mask = b_input_mask.unsqueeze(-1).repeat(1, 1, self.lstm_hidden_size * 2)

        # Convert mask to binary mask
        mask = mask.byte()
        # print("debug", mask[0].shape)
        # # Check if the first timestep has a zero mask value, and if so, set it to 1
        # if not (mask[0].all()):
        #     mask.fill_(1)

        # Pass the embeddings through the Bi-LSTM layer
        lstm_outputs, _ = self.lstm(embeddings)

        # Apply the mask to the LSTM outputs
        masked_lstm_outputs = lstm_outputs.masked_fill(mask == 1, float('-inf'))

        # Pass the masked LSTM outputs through the linear layer
        logits = self.linear(masked_lstm_outputs)
        logits = torch.nan_to_num(logits, nan=1.0)
        # print("debug logits", logits)

        # Apply the CRF layer
        try:
            predicted_labels = self.crf.decode(logits, b_input_mask)
            print("debug predicted_labels 1", predicted_labels)
            return (predicted_labels,)
        except ValueError as error:
            if str(error) == "mask of the first timestep must all be on":
                print("hello")
                b_input_mask[0].fill_(1)
                predicted_labels = self.crf.decode(logits, b_input_mask.bool())
                # predicted_labels = np.array([np.array(x) for x in predicted_labels])
                print("debug predicted_labelsv2", predicted_labels)
                self.labels = predicted_labels
                return (predicted_labels,)
            else:
                print("debug: else crf_bilstm.py line 80")
                exit()

class Train:
    def format_time(self, elapsed):
        '''
        Takes a time in seconds and returns a string hh:mm:ss
        '''
        # Round to the nearest second.
        elapsed_rounded = int(round((elapsed)))

        # Format as hh:mm:ss
        return str(datetime.timedelta(seconds=elapsed_rounded))

    def fit(self, model, train_dataloader, validation_dataloader, epochs, device, optimizer, scheduler, criterion, writer, print_each=40):
        # Set the seed value all ovr the place to make this reproducible.
        seed_val = 2

        random.seed(seed_val)
        np.random.seed(seed_val)
        torch.manual_seed(seed_val)
        torch.cuda.manual_seed_all(seed_val)
        model_save_path = 'tmp'
        loss_values = []
        hist_valid_scores = []

        # For each epoch...
        for epoch_i in range(0, epochs):
            logs = {}

            # ========================================
            #               Training
            # ========================================

            # Perform one full pass over the training set.

            print("")
            print(
                '======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            print('Training...')

            # Measure how long the training epoch takes.
            t0 = time.time()

            # Reset the total loss for this epoch.
            total_loss = 0
            total_accuracy = 0

            # Put the model into training mode. Don't be mislead--the call to
            # `train` just changes the *mode*, it doesn't *perform* the training.
            model.train()

            # For each batch of training data...
            for step, batch in enumerate(train_dataloader):

                # Progress update every 40 batches.
                if step % print_each == 0 and not step == 0:
                    # Calculate elapsed time in minutes.
                    elapsed = self.format_time(time.time() - t0)

                    # Report progress.
                    print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                        step, len(train_dataloader), elapsed))

                # move batch data to device (cpu or gpu)
                batch = tuple(t.to(device) for t in batch)

                # `batch` contains three pytorch tensors:
                #   [0]: input ids
                #   [1]: attention masks
                #   [2]: labels

                model.zero_grad()
                outputs = model(batch)

                # The call to `model` always returns a tuple, so we need to pull the
                # loss value out of the tuple.
                logits = outputs[0]
                label_ids = batch[-1]
                # print(logits)
                if isinstance(logits, list):
                    logits = torch.FloatTensor(logits[0])
                    label_ids = np.array([np.array(x) for x in label_ids])
                    label_ids = torch.FloatTensor(label_ids)#stoped here because criteroin expect float but this is long
                    print("kjgkjgjg", len(label_ids), logits.view(-1),
                                    label_ids[-1])
                    loss = criterion(logits.view(-1),
                                    label_ids.view(-1))
                    # logits = np.argmax(logits, axis=1)
                    # label_ids = np.argmax(label_ids, axis=1)
                else:
                    loss = criterion(logits.view(-1, model.n_class),
                                    label_ids.view(-1))

                # Move logits back to cpu for metrics calculations
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()

                # Calculate the accuracy for this batch of test sentences.
                print(logits, label_ids)
                
                current_accuracy = flat_accuracy(logits, label_ids)
                total_accuracy += current_accuracy
                # Accumulate the training loss over all of the batches so that we can
                # calculate the average loss at the end. `loss` is a Tensor containing a
                # single value; the `.item()` function just returns the Python value
                # from the tensor.
                total_loss += loss.item()
                writer.add_scalar('training loss',
                                  loss.item(),
                                  epoch_i * len(train_dataloader)+step)
                writer.add_scalar('training Accuracy',
                                  current_accuracy,
                                  epoch_i * len(train_dataloader)+step)

                # Perform a backward pass to calculate the gradients.
                loss.backward()

                # Clip the norm of the gradients to 1.0.
                # This is to help prevent the "exploding gradients" problem.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                # Update parameters and take a step using the computed gradient.
                # The optimizer dictates the "update rule"--how the parameters are
                # modified based on their gradients, the learning rate, etc.
                optimizer.step()

                # Update the learning rate.
                scheduler.step()

            # Calculate the average loss over the training data.
            avg_train_loss = total_loss / len(train_dataloader)
            avg_train_accuracy = total_accuracy / len(train_dataloader)
            # Store the loss value for plotting the learning curve.
            loss_values.append(avg_train_loss)
            logs["log loss"] = avg_train_loss
            logs["accuracy"] = avg_train_accuracy

            print("")
            print("  Average training loss: {0:.2f}".format(avg_train_loss))
            print("  Training epcoh took: {:}".format(
                self.format_time(time.time() - t0)))

            # ========================================
            #               Validation
            # ========================================
            # After the completion of each training epoch, measure our performance on
            # our validation set.

            print("")
            print("Running Validation...")

            t0 = time.time()

            # Put the model in evaluation mode--the dropout layers behave differently
            # during evaluation.
            model.eval()

            # Tracking variables
            eval_loss, eval_accuracy, eval_f1, eval_recall, eval_precesion = 0, 0, 0, 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            validation_loss = 0
            # Evaluate data for one epoch
            for step_valid, batch in enumerate(validation_dataloader):

                # Add batch to GPU
                batch = tuple(t.to(device) for t in batch)

                # Unpack the inputs from our dataloader

                # Telling the model not to compute or store gradients, saving memory and
                # speeding up validation
                with torch.no_grad():

                    # Forward pass, calculate logit predictions.
                    # This will return the logits rather than the loss because we have
                    # not provided labels.
                    # token_type_ids is the same as the "segment ids", which
                    # differentiates sentence 1 and 2 in 2-sentence tasks.
                    # The documentation for this `model` function is here:
                    model.eval()
                    # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                    outputs = model(batch)

                # Get the "logits" output by the model. The "logits" are the output
                # values prior to applying an activation function like the softmax.

                logits = outputs[0]
                label_ids = batch[-1]

                validation_loss += criterion(logits.view(-1,
                                                         model.n_class), label_ids.view(-1))

                # print(logits)

                # Move logits and labels to CPU
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()

                # Calculate the accuracy for this batch of test sentences.
                tmp_eval_accuracy = flat_accuracy(logits, label_ids)
                tmp_eval_f1 = flat_f1(logits, label_ids)
                tmp_eval_recall = flat_recall(logits, label_ids)
                tmp_eval_precision = flat_precision(logits, label_ids)
                # Accumulate the total scores.
                eval_accuracy += tmp_eval_accuracy
                eval_f1 += tmp_eval_f1
                eval_recall += tmp_eval_recall
                eval_precesion += tmp_eval_precision

                # Track the number of batches
                nb_eval_steps += 1
                validation_loss = validation_loss/len(validation_dataloader)
                writer.add_scalar('validation Accuracy',
                                  tmp_eval_accuracy,
                                  epoch_i * len(validation_dataloader)+step_valid)
                writer.add_scalar('validation F1',
                                  tmp_eval_f1,
                                  epoch_i * len(validation_dataloader)+step_valid)
                writer.add_scalar('validation recall',
                                  tmp_eval_recall,
                                  epoch_i * len(validation_dataloader)+step_valid)
                writer.add_scalar('validation precesion',
                                  tmp_eval_precision,
                                  epoch_i * len(validation_dataloader)+step_valid)
            is_better = len(hist_valid_scores) == 0 or validation_loss < min(
                hist_valid_scores)
            hist_valid_scores.append(validation_loss)

            if is_better:
                patience = 0
                print(
                    'save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                model.save(model_save_path)
                # also save the optimizers' state
                torch.save(optimizer.state_dict(), model_save_path + '.optim')
            elif patience < 5:
                patience += 1
                '''print('hit patience %d' % patience, file=sys.stderr)
            if patience == int(5):
                # decay lr, and restore from previously best checkpoint
                print('load previously best model and decay learning rate to ', file=sys.stderr)
                # load model
                params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                model.load_state_dict(params['state_dict'])
                model = model.to(torch.device("cuda"))
                print('restore parameters of the optimizers', file=sys.stderr)
                optimizer.load_state_dict(torch.load(model_save_path + '.optim'))
                # reset patience
                patience = 0'''
            # Report the final accuracy for this validation run.
            print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
            print("  F1: {0:.2f}".format(eval_f1/nb_eval_steps))
            print("  Recall: {0:.2f}".format(eval_recall/nb_eval_steps))
            print("  Precision: {0:.2f}".format(eval_precesion/nb_eval_steps))
            print("  Validation took: {:}".format(
                self.format_time(time.time() - t0)))
        return (eval_accuracy/nb_eval_steps, eval_f1/nb_eval_steps, eval_recall/nb_eval_steps, eval_precesion/nb_eval_steps,)

import sys
sys.path.append('../library')
import warnings
warnings.filterwarnings('ignore')
import gc 
from crf_bilstm import CRFBiLSTMModel

from transformers import BertModel,BertTokenizer,FlaubertTokenizer, FlaubertModel,AutoTokenizer, BertForSequenceClassification , FlaubertForSequenceClassification
from transformers.modeling_utils import SequenceSummary

from torch import nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import re
from nlp.models import BasicBertForClassification
from nlp.training import Train
from nlp.preprocessing import TextPreprocessing
from nlp.feature_extraction import MetaFeaturesExtraction
from nlp.data_visualisation import WordCloudMaker
# needed

import torch
from tensorboardX import SummaryWriter
from transformers import  AutoModel
import pandas as pd
from sklearn.model_selection import train_test_split
from nlp.feature_extraction import BertInput
from transformers import AdamW,get_linear_schedule_with_warmup
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from focalLoss import FocalLoss2
from datetime import datetime

##################################### Functions ###########################################################
def getLoss(name, normedWeights, dic_cat_labels):
    if name == "default":
        return None
    elif name == "crossentropy":
        return nn.CrossEntropyLoss()
    elif name == "crossentropyweighted":
        return nn.CrossEntropyLoss(weight=normedWeights)
    elif name == "focalloss":
        return FocalLoss2(gamma=5.,alpha=0.25, num_class=len(dic_cat_labels))

def get_sentences_labels(df,text_column='text_clean',label_column='CAT',cat_labels=None):
    dic_cat_labels = cat_labels if cat_labels is not None else {x:value for x,value in enumerate(df[label_column].unique())}
    dic_labels_to_cat = {value:x for x,value in dic_cat_labels.items() }
    #df[text_column]= df[text_column].map(lambda text_clean : re.sub('["#$%&()*+,-./:;<=>@[\]^_`{|}~\n\t’\']', '', text_clean))
    df2 = df[label_column].map(dic_labels_to_cat)
    sentences = df[text_column].values
    labels = df2.values.astype(int)
    return sentences,labels,dic_cat_labels
def get_label_callback(dataset,idx):
    return dataset[idx][3].item()
#################################### Inline code ##########################################################
def main (file, separator, col_text, model_name, test, col_label, loss, finetune, no_other=False):
    writer = SummaryWriter('runs/test')
    print("Begin: Current Time =", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))
    gc.collect()
    print("finetune:", finetune, "start test with categories = ", col_label, " using loss = ", loss, " and sampling = ", test)
    df = pd.read_csv(file,sep=separator, quotechar='"', dtype='str')
    # df = pd.read_csv(file,sep=separator,lineterminator="\n")
    print(f"Import de {file} : \nNombres d'instance :  {len(df)} \n")
    print(df[col_label].value_counts(), " \n")
    df.drop(df[df[col_label] =="NotAnnotated"].index, inplace=True)
    df.drop(df[df[col_label] =="<TOCOMPLETE>"].index, inplace=True)
    if col_label=="SA2" and no_other:
        df.drop(df[df[col_label] =="Autre"].index, inplace=True)
    print(f"AFTER DROP: Nombres d'instance :  {len(df)} \n")
    # print(df.head(2))
    df = df[0:50]

    text_preprocessing = TextPreprocessing(df,col_text)
    text_preprocessing.fit_transform()

    df_train , df_test = train_test_split(df,random_state=1, test_size=0.2)

    sentences_train,labels_train,dic_cat_labels=get_sentences_labels(df_train,text_column='processed_text',label_column=col_label)
    n_class = len(dic_cat_labels)
    sentences_test,labels_test,dic_cat_labels=get_sentences_labels(df_test,text_column='processed_text',label_column=col_label,cat_labels=dic_cat_labels)

    n_class = n_class if n_class >  len(dic_cat_labels) else len(dic_cat_labels)

    print("Classes : " )
    print(dic_cat_labels)

    bert_input= BertInput(AutoTokenizer.from_pretrained(model_name))

    X_train = bert_input.fit_transform(sentences_train)
    X_test = bert_input.fit_transform(sentences_test)
    print(dic_cat_labels)

    # Use 90% for training and 10% for validation.
    train_inputs, validation_inputs, train_labels, validation_labels,train_masks,validation_masks = train_test_split(X_train[0], labels_train,X_train[1],random_state=1, test_size=0.2)

    # Do the same for the masks
    test_inputs = X_test[0]
    test_masks = X_test[1]
    print(labels_test)
    test_labels = np.argmax(np.array(labels_test), axis=0) #now I tried this following a stackoverflow suggestion


    # Convert all inputs and labels into torch tensors, the required datatype 
    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)
    test_inputs = torch.tensor(test_inputs)

    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)
    test_labels = torch.tensor(test_labels)


    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)
    test_masks = torch.tensor(test_masks)

    print("len(train_labels)=", len(train_labels))
    batch_size = 1 #here it should be 16 but it is not working

    # Create the DataLoader for our training set.
    train_data = TensorDataset(train_inputs,train_masks,train_labels)
    train_sampler = RandomSampler(train_data)

    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size,drop_last=True )

    # Create the DataLoader for our validation set.
    validation_data = TensorDataset(validation_inputs,validation_masks ,validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


    # Create the DataLoader for our test set.
    test_data = TensorDataset(test_inputs,test_masks, test_labels)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    print("len(test_data[0])=", len(test_data[0]))
                
    base_model = AutoModel.from_pretrained(model_name)
    model = CRFBiLSTMModel(bert=base_model,n_class=n_class)
    model.cpu()

    # finetune the embedding while training
    model.bert.embeddings.requires_grad = fine

    optimizer = AdamW(model.parameters(),
                    lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                    eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                    )

    epochs = 4

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * epochs 

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps = 0, # Default value in run_glue.py
                                                num_training_steps = total_steps)

    df_value = pd.DataFrame(train_labels).value_counts(sort=False)
    normedWeights = [1 - (x / sum(df_value)) for x in df_value]
    normedWeights = torch.FloatTensor(normedWeights).to('cpu')
    loss_function = getLoss(loss, normedWeights, dic_cat_labels)

    train = Train()
    train.fit(model,train_dataloader,validation_dataloader,epochs,torch.device('cpu'),optimizer,scheduler,loss_function, writer)
    

    # Train the model on your training data
    for epoch in range(epochs):
        for batch in train_dataloader:
            # Clear gradients
            optimizer.zero_grad()
            
            # Forward pass
            predicted_labels = model(batch)
            print(len(predicted_labels))
            predicted_labels = torch.tensor(predicted_labels)
            true_labels = torch.tensor(batch[-1].to('cpu').numpy())
            print("debug3", (predicted_labels), (true_labels))
            loss = loss_function(predicted_labels, true_labels.view(len(true_labels), 1))
            
            # Backward pass
            loss.backward()
            optimizer.step()

    # Evaluate the trained model on your test data
    with torch.no_grad():
        all_predicted_labels = []
        all_true_labels = []
        for batch in validation_data:
            predicted_labels = model(batch)
            true_labels = batch.labels
            all_predicted_labels.extend(predicted_labels)
            all_true_labels.extend(true_labels)
    print("end validation")
    # Generate a classification report to evaluate the performance of your model
    target_names = ["Negative", "Positive"]
    print(classification_report(all_true_labels, all_predicted_labels, target_names=target_names))


model_names = ['camembert-base']
tests = ["Random Sampling"]
categories=["CAT"]
losses = ["crossentropy", "crossentropyweighted", "focalloss"]
finetune = [False, True]
for model_name in model_names:
    print(model_name)
    for test in tests:
        for col_label in categories:
            for loss in losses:
                for fine in finetune:
                    main("../data/corpus1.csv", "\t", "text", model_name, test=test, loss = loss, col_label=col_label, finetune=fine, no_other=True)

My corpus is a normal csv I read it as a dataframe. It contains one column for the text and one column for the label. I have seven categories.

Can you help me with this?

Right now my model is :

BiLSTM -> Linear Layer (Hidden to tag) -> CRf Layer

How does it behave when you try to run it? Could you describe what part you feel stuck on? — Brock Brown, May 04 '23 at 17:51
line 151, in main test_data = TensorDataset(test_inputs,test_masks, test_labels) lib/python3.10/site-packages/torch/utils/data/dataset.py", line 189, in assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors), "Size mismatch between tensors" IndexError: Dimension specified as 0 but tensor has no dimensions this is hapening because "test_labels = np.argmax(np.array(labels_test), axis=0)" I added this line because of: current_accuracy = flat_accuracy(logits, label_ids) numpy/core/fromnumeric.py numpy.AxisError: axis 1 is out of bounds for array of dimension 1 — leila, May 04 '23 at 18:57

BiLSTM-CRF for text classification in PYTORCH

0 Answers0