Foundation
I'm fine-tuning with GPTJForCausalLM (the pretrained model is 6B, fp16).
Environment
Ubuntu 20.04 (nvidia-docker)
CUDA version (in docker) : 11.4
RTX 3090 (24 GB VRAM)
torch.__version__ : 1.12.1
transformers.__version__ : 4.12.5
jupyter lab version : 3.5.0
python3 version : Python 3.7.13
pretrained model : kakaobrain/kogpt (6B, fp16), https://github.com/kakaobrain/kogpt
Problem
If I freeze some layers' parameters and run model.forward() - loss.backward() - optimizer.step(), the parameters that are not frozen turn to nan. A single step is enough to produce nan.
Question
Why do I get this error? None of the internet search results have helped me.
Bug reproduction code
!git clone https://github.com/kakaobrain/kogpt.git
!pip install -r ./kogpt/requirements.txt
from transformers import GPTJForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy
import inspect
import torch.optim as optim
import types
import re
import numpy as np
# tokenizer/model loading code from the model's official GitHub repo; this part works well.
tokenizer = AutoTokenizer.from_pretrained(
'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16', # or float32 version: revision=KoGPT6B-ryan1.5b
bos_token='[BOS]', eos_token='[EOS]', unk_token='[UNK]', pad_token='[PAD]', mask_token='[MASK]'
)
model = AutoModelForCausalLM.from_pretrained(
'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16', # or float32 version: revision=KoGPT6B-ryan1.5b
pad_token_id=tokenizer.eos_token_id,
torch_dtype='auto', low_cpu_mem_usage=True
)
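As an extra check on my side (not required for the repro), I confirm the dtype that actually got loaded with torch_dtype='auto':

#optional sanity check: the checkpoint should load as float16
print(next(model.parameters()).dtype)
#torch.float16 <- result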
# as a minimal working test, I freeze every layer except the last one.
ls = list(model.modules())
for i, m in enumerate(ls):
    if i == len(ls) - 1:
        # unfreeze the last layer
        m.requires_grad_(True)
    else:
        # freeze all other layers
        m.requires_grad_(False)
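To make explicit what the loop above leaves trainable, here is a quick check I use (illustration only, not part of the failing run):

#list the parameter names that are still trainable after the freeze loop
trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print(len(trainable), 'trainable tensors, e.g.', trainable[:3])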
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
_ = model.cuda()
_ = model.train()
sample = ['Some text data for finetuning-1', 'Some text data for finetuning-2']
with torch.cuda.amp.autocast():  # removing this line does not change the result
    # make a batch
    batch = tokenizer(sample, padding=True, truncation=True, max_length=64, return_tensors='pt')
    batch = {k: v.cuda() for k, v in batch.items()}
    # forward
    out = model(**batch)
    # loss (next-token cross-entropy with shifted logits/labels)
    loss = F.cross_entropy(out.logits[:, :-1, :].flatten(0, -2),
                           batch['input_ids'][:, 1:].flatten(),
                           reduction='mean')
print(loss.grad)
#None <- result
print(loss.is_leaf)
#False <- result
print(loss)
#tensor(6.6850, device='cuda:0', grad_fn=<NllLossBackward0>) <- result
print(list(model.parameters())[-1])
#Parameter containing: <- result
#tensor([-0.0454, 0.0815, -0.0442, ..., -0.0566, -0.0557, -0.0552],
# device='cuda:0', dtype=torch.float16, requires_grad=True)
loss.backward()
print(loss.grad)
#None <- print result
print(loss.is_leaf)
#False <- print result
print(list(model.parameters())[-1])
#Parameter containing: <- result
#tensor([-0.0454, 0.0815, -0.0442, ..., -0.0566, -0.0557, -0.0552],
# device='cuda:0', dtype=torch.float16, requires_grad=True)
optimizer.step()
print(list(model.parameters())[-1])
#Parameter containing: <- result, this is the problem point.
#tensor([ nan, 0.0815, nan, ..., nan, nan, nan], device='cuda:0',
# dtype=torch.float16, requires_grad=True)
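As extra debugging information, this is the kind of check I run right after loss.backward() and again after optimizer.step() to see where the non-finite values first appear (a debugging sketch, not part of the run above):

#debugging sketch: locate non-finite gradients / parameter values
for name, p in model.named_parameters():
    if p.grad is not None and not torch.isfinite(p.grad).all():
        print('non-finite grad:', name)
    if not torch.isfinite(p).all():
        print('non-finite param:', name)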
More Information
What I really want to do is downstream fine-tuning with LoRA.
I first tried it with the code below.
#(... imports and tokenizer, same as in the bug reproduction code above.)
#load model
model = AutoModelForCausalLM.from_pretrained(
'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16', # or float32 version: revision=KoGPT6B-ryan1.5b
pad_token_id=tokenizer.eos_token_id,
torch_dtype='auto', low_cpu_mem_usage=True
)
# my code for adding LoRA adapters (refer to https://github.com/huggingface/transformers/issues/14839)
def forward_linear_with_adapter(self, input: torch.Tensor) -> torch.Tensor:
    # replacement for nn.Linear's forward()
    out = F.linear(input, self.weight, self.bias)
    if self.lora_adapter:
        out += self.lora_adapter(input)
    return out

def AddLoRAtoLinear(layer, adapter_dim=16, _dtype=None):
    # add the adapter
    dt = _dtype if _dtype else layer._parameters['weight'].dtype
    layer.lora_adapter = nn.Sequential(
        nn.Linear(layer.in_features, adapter_dim, bias=False, dtype=dt),
        nn.Linear(adapter_dim, layer.out_features, bias=False, dtype=dt)
    )
    # make the adapter trainable
    layer.lora_adapter.requires_grad_(True)
    nn.init.zeros_(layer.lora_adapter[1].weight)
    # bind the adapter-aware forward to this layer
    layer.forward = types.MethodType(forward_linear_with_adapter, layer)
def forward_embedding_with_adapter(self, input: torch.Tensor) -> torch.Tensor:
    # replacement for nn.Embedding's forward()
    out = F.embedding(input, self.weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse)
    if self.lora_adapter:
        out += self.lora_adapter(input)
    return out

def AddLoRAtoEmbedding(layer, adapter_dim=16, _dtype=None):
    dt = _dtype if _dtype else layer._parameters['weight'].dtype
    # add the adapter
    layer.lora_adapter = nn.Sequential(
        nn.Embedding(layer.num_embeddings, adapter_dim, dtype=dt),
        nn.Linear(adapter_dim, layer.embedding_dim, bias=False, dtype=dt)
    )
    # make the adapter trainable
    layer.lora_adapter.requires_grad_(True)
    nn.init.zeros_(layer.lora_adapter[1].weight)  # follows the LoRA paper's initialization
    # bind the adapter-aware forward to this layer
    layer.forward = types.MethodType(forward_embedding_with_adapter, layer)
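To illustrate how the forward() re-binding behaves on its own (a standalone sketch, separate from the 6B model and the bug), a tiny fp32 linear layer can be wrapped like this:

#standalone illustration only: wrap a small nn.Linear with the helper above
_test_lin = nn.Linear(8, 4)
AddLoRAtoLinear(_test_lin, adapter_dim=2)
_test_out = _test_lin(torch.randn(3, 8))  # goes through forward_linear_with_adapter
print(_test_out.shape)  # torch.Size([3, 4])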
def MakeLoRA(model):
    # freeze all original parameters.
    for p in model.parameters():
        p.requires_grad_(False)
    # apply LoRA only to embedding & linear layers.
    needchange = []
    for module in model.modules():
        if type(module) == nn.Linear or type(module) == nn.Embedding:
            needchange.append(module)
    for module in needchange:
        if type(module) == nn.Linear:
            # attach a LoRA adapter to this linear layer
            AddLoRAtoLinear(module)
        elif type(module) == nn.Embedding:
            # attach a LoRA adapter to this embedding layer
            AddLoRAtoEmbedding(module)
    return model
if False:
    # instead of doing this (the last-layer-only unfreeze from the bug reproduction code above), do MakeLoRA()
    ls = list(model.modules())
    for i, m in enumerate(ls):
        if i == len(ls) - 1:
            # unfreeze the last layer
            m.requires_grad_(True)
        else:
            # freeze all other layers
            m.requires_grad_(False)
else:
    # change the model so it has LoRA adapters
    model = MakeLoRA(model)
#(... then run the same forward/backward/step as in the bug reproduction code, and I get the same nan error.)
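For reference, after MakeLoRA() I verify that only the adapter weights are left trainable with something like this (a sanity-check sketch built on the code above):

#sanity check: only lora_adapter parameters should still require grad after MakeLoRA()
adapter_params = [n for n, p in model.named_parameters() if p.requires_grad]
print(len(adapter_params), 'trainable tensors; sample names:', adapter_params[:4])
print(all('lora_adapter' in n for n in adapter_params))  # expected to be True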