I'm a novice in the LLM area. I want to fine-tune a 7B-parameter LLM named 'Moss' so that it can answer multiple-choice questions. The pretrained model and tokenizer come from Hugging Face, and I integrated LoRA into the model to reduce the number of trainable parameters and fine-tune quickly. However, the 'train_loss' does not converge and the 'validation_loss' does not change at all. I have inspected my code and looked up a lot of information, but I have no idea how to fix it. I would greatly appreciate it if anyone could point out the problem in my code.
I copied the data-processing code from the Hugging Face multiple-choice example, and I designed MossForMultipleChoice based on [BertForMultipleChoice](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py#L1620). [Here is the Moss model I used](https://huggingface.co/fnlp/moss-base-7b).
The code below is the training script:
# %%
import os
import pdb
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
os.environ['HF_HOME'] = "/root/dataln/wuyan/hf/misc"
os.environ['HF_DATASETS_CACHE'] = "/root/dataln/wuyan/hf/datasets"
os.environ['TRANSFORMERS_CACHE'] = "/root/dataln/wuyan/hf/models"
from datasets import load_dataset
swag = load_dataset("swag", "regular")
# %%
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "/root/dataln/wuyan/moss-base-7b", trust_remote_code=True)
if tokenizer.pad_token is None:
    # reuse EOS as the pad token if the tokenizer does not define one
    tokenizer.pad_token = tokenizer.eos_token
# %%
ending_names = ["ending0", "ending1", "ending2", "ending3"]
def preprocess_function(examples):
    # Repeat each context 4 times so it can be paired with each of the 4 endings.
    first_sentences = [[context] * 4 for context in examples["sent1"]]
    question_headers = examples["sent2"]
    second_sentences = [
        [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
    ]
    # Flatten, tokenize the (context, ending) pairs, then un-flatten back into groups of 4.
    first_sentences = sum(first_sentences, [])
    second_sentences = sum(second_sentences, [])
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
    return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
# %%
tokenized_swag = swag.map(preprocess_function, batched=True)
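# (Optional sanity check, just for debugging: after the map, each example should
# hold 4 tokenized sequences, one per ending.)
# ex = tokenized_swag["train"][0]
# print(len(ex["input_ids"]), [len(ids) for ids in ex["input_ids"]])  # expect 4 lists of token ids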
# %%
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
import numpy as np
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads the inputs for multiple choice.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]  # one label per example
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        # Flatten (batch, num_choices) into a single list so the tokenizer can pad everything at once.
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Restore the (batch_size, num_choices, seq_len) shape and attach the labels.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch
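# For a batch of B examples with 4 choices padded to length L, the collator should
# return input_ids / attention_mask of shape (B, 4, L) and labels of shape (B,).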
# %%
import evaluate
accuracy = evaluate.load("accuracy")
import numpy as np
def compute_metrics(eval_pred):
    pdb.set_trace()  # left in for debugging; remove for a full run
    predictions, labels = eval_pred
    # predictions has shape (num_examples, num_choices), so argmax over axis 1 picks a choice
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
# %%
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer,AutoModelForCausalLM,AutoModel
from peft import LoraConfig, get_peft_model
# from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live
import loralib as lora
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    names = []
    for name, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            names.append(name)
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )
    with open("trainable_params.txt", "w") as f:
        f.write(str(names))
model = AutoModelForMultipleChoice.from_pretrained(
    "/root/dataln/wuyan/moss-base-7b", trust_remote_code=True)
# estimate_zero2_model_states_mem_needs_all_live(model)
lora.mark_only_lora_as_trainable(model,bias="all")
model.classifier.weight.requires_grad_(True)
model.pooler.dense.weight.requires_grad_(True)
print_trainable_parameters(model)
# %%
from transformers.trainer import TrainerCallback
training_args = TrainingArguments(
    output_dir="my_awesome_swag_model",
    evaluation_strategy="steps",
    logging_strategy="steps",
    logging_steps=300,
    save_strategy="steps",
    save_steps=1000,
    load_best_model_at_end=True,
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    save_total_limit=5,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=5,
)
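# Note: with per_device_train_batch_size=1 and gradient_accumulation_steps=5,
# the effective batch size per optimizer step is 5.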
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_swag["train"],
    eval_dataset=tokenized_swag["validation"].select(range(50)),
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer.train()
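For reference, this is the kind of quick single-batch check I would run with the objects defined above to see whether the loss and logits react at all (just a sketch, not part of the actual training run):

```python
# Build one small batch with the collator defined above and run a manual forward pass.
collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
features = [
    {k: tokenized_swag["train"][i][k] for k in ("input_ids", "attention_mask", "label")}
    for i in range(2)
]
batch = collator(features)
batch = {k: v.to(model.device) for k, v in batch.items()}
with torch.no_grad():
    out = model(**batch)
print(out.loss)    # for 4 choices, an untrained head should start near ln(4) ≈ 1.386
print(out.logits)  # shape (2, 4); the scores should differ across the 4 choices
```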
The code below is what I added to 'modeling_moss.py':
class MossPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output
class MossForMultipleChoice(MossPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.model = MossModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.pooler = MossPooler(config)
        # Initialize weights and apply final processing
        self.post_init()
    @add_start_docstrings_to_model_forward(MOSS_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        # token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        # head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        # import pdb
        # pdb.set_trace()
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Flatten (batch_size, num_choices, ...) into (batch_size * num_choices, ...) for the base model.
        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        # token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            # token_type_ids=token_type_ids,
            position_ids=position_ids,
            past_key_values=past_key_values,
            # head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # print(np.sum(np.array(self.classifier.weight.cpu())))
        # print(f'classifier_train:{self.classifier.weight.requires_grad},pooler_train:{self.pooler.dense.weight.requires_grad}')
        # print(f'model.layers.25.mlp.up_proj_train:{self.model.layers[25].mlp.up_proj.weight.requires_grad}')

        # Pool the sequence output, score each choice with a single linear unit, and reshape to (batch, num_choices).
        pooled_output = self.pooler(outputs[0])
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # print(f'self.pooler.dense.weight: {self.pooler.dense.weight}')
        # print(f'self.classifier.weight: {self.classifier.weight}')
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
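Like BERT's pooler, MossPooler takes the hidden state of the first token and passes it through a Linear + Tanh; the classifier then turns each pooled vector into one score, and the scores are reshaped to (batch_size, num_choices). A tiny standalone shape check (with a dummy config, so it runs without the 7B weights) looks like this:

```python
import torch
import torch.nn as nn

class DummyConfig:
    hidden_size = 8  # tiny hidden size, just for the shape check

pooler = MossPooler(DummyConfig())        # same MossPooler class as above
classifier = nn.Linear(8, 1)
hidden_states = torch.randn(3 * 4, 5, 8)  # (batch*num_choices, seq_len, hidden_size)
pooled = pooler(hidden_states)            # (12, 8): one vector per (example, choice)
logits = classifier(pooled).view(-1, 4)   # (3, 4): one score per choice, as in forward()
print(pooled.shape, logits.shape)
```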
The code below is the config.json that I modified:
{
  "architectures": [
    "MossForCausalLM",
    "MossForMultipleChoice"
  ],
  "auto_map": {
    "AutoConfig": "configuration_moss.MossConfig",
    "AutoModel": "modeling_moss.MossModel",
    "AutoModelForCausalLM": "modeling_moss.MossForCausalLM",
    "AutoModelForMultipleChoice": "modeling_moss.MossForMultipleChoice"
  },
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "model_type": "moss",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-05,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.31.0.dev0",
  "use_cache": true,
  "vocab_size": 92494,
  "classifier_dropout": 0.1
}
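To double-check that the edited config.json and the auto_map entry are actually picked up from the local directory, a quick check like this (same path as in the training script) should work:

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("/root/dataln/wuyan/moss-base-7b", trust_remote_code=True)
print(cfg.classifier_dropout)                      # expect 0.1 from the edited config
print(cfg.auto_map["AutoModelForMultipleChoice"])  # expect "modeling_moss.MossForMultipleChoice"
```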
As you can see, there must be a problem somewhere, but I don't know why and I can't find it. I have examined the LoRA layers and I am sure their weights have been updated. Regardless of whether my metric is correct or not, I believe the validation_loss should definitely change once these weights are updated, but it does not.
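For reference, a check along these lines (a sketch, reusing the objects from the training script) is how one can confirm that the trainable weights really change:

```python
# Snapshot every trainable parameter (LoRA layers, classifier, pooler, biases) before training...
before = {n: p.detach().clone() for n, p in model.named_parameters() if p.requires_grad}
trainer.train()
# ...then report how much each one moved during training.
for n, p in model.named_parameters():
    if p.requires_grad:
        delta = (p.detach() - before[n]).abs().max().item()
        print(f"{n}: max abs change = {delta:.6g}")
```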