I'm a novice in the LLM area. I want to fine-tune a 7B-parameter LLM named 'Moss' so that it can answer multiple-choice questions. The pretrained model and tokenizer come from Hugging Face, and I integrated LoRA into the model to reduce the number of trainable parameters and fine-tune quickly. However, the 'train_loss' does not converge and the 'validation_loss' does not change at all. I have inspected my code and looked up a lot of information, but I have no idea how to fix it. I would greatly appreciate it if anyone could point out the problem in my code.
I copied the data-processing code from the Hugging Face multiple-choice example, and I designed MossForMultipleChoice based on [BertForMultipleChoice](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py#L1620). [Here is the Moss model I used](https://huggingface.co/fnlp/moss-base-7b).
The code below is the training script:
# %%
import os
import pdb
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
os.environ['HF_HOME'] = "/root/dataln/wuyan/hf/misc"
os.environ['HF_DATASETS_CACHE'] = "/root/dataln/wuyan/hf/datasets"
os.environ['TRANSFORMERS_CACHE'] = "/root/dataln/wuyan/hf/models"
from datasets import load_dataset
swag = load_dataset("swag", "regular")
# %%
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "/root/dataln/wuyan/moss-base-7b", trust_remote_code=True)
if tokenizer.pad_token is None:
    # reuse EOS as the pad token if the tokenizer does not define one
    tokenizer.pad_token = tokenizer.eos_token
# %%
ending_names = ["ending0", "ending1", "ending2", "ending3"]
def preprocess_function(examples):
    # Repeat each context 4 times so it can be paired with each of the 4 endings.
    first_sentences = [[context] * 4 for context in examples["sent1"]]
    question_headers = examples["sent2"]
    second_sentences = [
        [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
    ]
    # Flatten, tokenize the (context, ending) pairs, then un-flatten back into groups of 4.
    first_sentences = sum(first_sentences, [])
    second_sentences = sum(second_sentences, [])
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
    return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
# %%
tokenized_swag = swag.map(preprocess_function, batched=True)
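# (Optional sanity check, just for debugging: after the map, each example should
# hold 4 tokenized sequences, one per ending.)
# ex = tokenized_swag["train"][0]
# print(len(ex["input_ids"]), [len(ids) for ids in ex["input_ids"]])  # expect 4 lists of token ids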
# %%
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
import numpy as np
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads the inputs for multiple choice.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]  # one label per example
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        # Flatten (batch, num_choices) into a single list so the tokenizer can pad everything at once.
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Restore the (batch_size, num_choices, seq_len) shape and attach the labels.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch
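# For a batch of B examples with 4 choices padded to length L, the collator should
# return input_ids / attention_mask of shape (B, 4, L) and labels of shape (B,).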
# %%
import evaluate
accuracy = evaluate.load("accuracy")
import numpy as np
def compute_metrics(eval_pred):
    pdb.set_trace()  # left in for debugging; remove for a full run
    predictions, labels = eval_pred
    # predictions has shape (num_examples, num_choices), so argmax over axis 1 picks a choice
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
# %%
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer,AutoModelForCausalLM,AutoModel
from peft import LoraConfig, get_peft_model
# from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live
import loralib as lora
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    names = []
    for name, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            names.append(name)
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )
    with open("trainable_params.txt", "w") as f:
        f.write(str(names))
model = AutoModelForMultipleChoice.from_pretrained(
    "/root/dataln/wuyan/moss-base-7b", trust_remote_code=True)
# estimate_zero2_model_states_mem_needs_all_live(model)
lora.mark_only_lora_as_trainable(model,bias="all")
model.classifier.weight.requires_grad_(True)
model.pooler.dense.weight.requires_grad_(True)
print_trainable_parameters(model)
# %%
from transformers.trainer import TrainerCallback
training_args = TrainingArguments(
    output_dir="my_awesome_swag_model",
    evaluation_strategy="steps",
    logging_strategy="steps",
    logging_steps=300,
    save_strategy="steps",
    save_steps=1000,
    load_best_model_at_end=True,
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    save_total_limit=5,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=5,
)
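# Note: with per_device_train_batch_size=1 and gradient_accumulation_steps=5,
# the effective batch size per optimizer step is 5.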
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_swag["train"],
    eval_dataset=tokenized_swag["validation"].select(range(50)),
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer.train()
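For reference, this is the kind of quick single-batch check I would run with the objects defined above to see whether the loss and logits react at all (just a sketch, not part of the actual training run):

```python
# Build one small batch with the collator defined above and run a manual forward pass.
collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
features = [
    {k: tokenized_swag["train"][i][k] for k in ("input_ids", "attention_mask", "label")}
    for i in range(2)
]
batch = collator(features)
batch = {k: v.to(model.device) for k, v in batch.items()}
with torch.no_grad():
    out = model(**batch)
print(out.loss)    # for 4 choices, an untrained head should start near ln(4) ≈ 1.386
print(out.logits)  # shape (2, 4); the scores should differ across the 4 choices
```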
The code below is what I added to 'modeling_moss.py':
class MossPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output
class MossForMultipleChoice(MossPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.model = MossModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.pooler = MossPooler(config)
        # Initialize weights and apply final processing
        self.post_init()
    @add_start_docstrings_to_model_forward(MOSS_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        # token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        # head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        # import pdb
        # pdb.set_trace()
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Flatten (batch_size, num_choices, ...) into (batch_size * num_choices, ...) for the base model.
        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        # token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            # token_type_ids=token_type_ids,
            position_ids=position_ids,
            past_key_values=past_key_values,
            # head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # print(np.sum(np.array(self.classifier.weight.cpu())))
        # print(f'classifier_train:{self.classifier.weight.requires_grad},pooler_train:{self.pooler.dense.weight.requires_grad}')
        # print(f'model.layers.25.mlp.up_proj_train:{self.model.layers[25].mlp.up_proj.weight.requires_grad}')

        # Pool the sequence output, score each choice with a single linear unit, and reshape to (batch, num_choices).
        pooled_output = self.pooler(outputs[0])
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # print(f'self.pooler.dense.weight: {self.pooler.dense.weight}')
        # print(f'self.classifier.weight: {self.classifier.weight}')
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
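Like BERT's pooler, MossPooler takes the hidden state of the first token and passes it through a Linear + Tanh; the classifier then turns each pooled vector into one score, and the scores are reshaped to (batch_size, num_choices). A tiny standalone shape check (with a dummy config, so it runs without the 7B weights) looks like this:

```python
import torch
import torch.nn as nn

class DummyConfig:
    hidden_size = 8  # tiny hidden size, just for the shape check

pooler = MossPooler(DummyConfig())        # same MossPooler class as above
classifier = nn.Linear(8, 1)
hidden_states = torch.randn(3 * 4, 5, 8)  # (batch*num_choices, seq_len, hidden_size)
pooled = pooler(hidden_states)            # (12, 8): one vector per (example, choice)
logits = classifier(pooled).view(-1, 4)   # (3, 4): one score per choice, as in forward()
print(pooled.shape, logits.shape)
```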
The code below is the config.json that I modified:
{
  "architectures": [
    "MossForCausalLM",
    "MossForMultipleChoice"
  ],
  "auto_map": {
    "AutoConfig": "configuration_moss.MossConfig",
    "AutoModel": "modeling_moss.MossModel",
    "AutoModelForCausalLM": "modeling_moss.MossForCausalLM",
    "AutoModelForMultipleChoice": "modeling_moss.MossForMultipleChoice"
  },
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "model_type": "moss",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-05,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.31.0.dev0",
  "use_cache": true,
  "vocab_size": 92494,
  "classifier_dropout": 0.1
}
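To double-check that the edited config.json and the auto_map entry are actually picked up from the local directory, a quick check like this (same path as in the training script) should work:

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("/root/dataln/wuyan/moss-base-7b", trust_remote_code=True)
print(cfg.classifier_dropout)                      # expect 0.1 from the edited config
print(cfg.auto_map["AutoModelForMultipleChoice"])  # expect "modeling_moss.MossForMultipleChoice"
```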
As you can see, there must be a problem somewhere, but I don't know why and I can't find it. I have examined the LoRA layers and I am sure their weights have been updated. Regardless of whether my metric is correct or not, I believe the validation_loss should definitely change once these weights are updated, but it does not.
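For reference, a check along these lines (a sketch, reusing the objects from the training script) is how one can confirm that the trainable weights really change:

```python
# Snapshot every trainable parameter (LoRA layers, classifier, pooler, biases) before training...
before = {n: p.detach().clone() for n, p in model.named_parameters() if p.requires_grad}
trainer.train()
# ...then report how much each one moved during training.
for n, p in model.named_parameters():
    if p.requires_grad:
        delta = (p.detach() - before[n]).abs().max().item()
        print(f"{n}: max abs change = {delta:.6g}")
```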