  1. Foundation
    I'm fine-tuning GPTJForCausalLM (the pretrained model is 6B, fp16).

  2. Environment
    Ubuntu 20.04 (nvidia-docker)
    CUDA version (inside the container): 11.4
    RTX 3090 (24 GB VRAM)
    torch.__version__: 1.12.1
    transformers.__version__: 4.12.5
    JupyterLab version: 3.5.0
    Python version: 3.7.13
    pretrained model: kakao-kogpt (6B, fp16) https://github.com/kakaobrain/kogpt

  3. Problem
    If I freeze some layers' parameters and then run model.forward() -> loss.backward() -> optimizer.step(), the parameters that were left unfrozen turn to NaN. A single step is enough to produce the NaNs.

  4. Question
    Why do I get this error? None of the results I found by searching online have helped.

  5. Bug reproduction Code

!git clone https://github.com/kakaobrain/kogpt.git
!pip install -r ./kogpt/requirements.txt

from transformers import GPTJForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM 
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy
import inspect
import torch.optim as optim
import types
import re
import numpy as np

#official loading code from the model's GitHub repo; it works fine.
tokenizer = AutoTokenizer.from_pretrained(
      'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16',  # or float32 version: revision=KoGPT6B-ryan1.5b
      bos_token='[BOS]', eos_token='[EOS]', unk_token='[UNK]', pad_token='[PAD]', mask_token='[MASK]'
    )

model = AutoModelForCausalLM.from_pretrained(
      'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16',  # or float32 version: revision=KoGPT6B-ryan1.5b
      pad_token_id=tokenizer.eos_token_id,
      torch_dtype='auto', low_cpu_mem_usage=True
    )

#for a minimal test, I freeze all layers except the last one.
ls = list(model.modules())
for i, m in enumerate(ls):
    if i == len(ls) - 1:
        #unfreeze for last layer
        m.requires_grad_(True)
    else:
        #freeze other layers
        m.requires_grad_(False)
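
#(optional sanity check, not in the original run) confirm that only a small fraction
#of the parameters is left trainable after the loop above
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_total = sum(p.numel() for p in model.parameters())
print(f'trainable parameters: {n_trainable} / {n_total}')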

optimizer = optim.AdamW(model.parameters(), lr=1e-5)
_ = model.cuda()
_ = model.train()

sample = ['Some text data for finetuning-1', 'Some text data for finetuning-2']
with torch.cuda.amp.autocast(): #this line does not change the result.
    #make batch
    batch = tokenizer(sample, padding=True, truncation=True, max_length=64, return_tensors='pt')
    batch = {k:v.cuda() for k, v in batch.items()}

    #forward
    out = model(**batch)

    #loss
    loss = F.cross_entropy(out.logits[:, :-1, :].flatten(0,-2), 
                       batch['input_ids'][:,1:].flatten(),
                       reduction='mean')
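
    #(aside, not part of the original run; HF causal-LM models shift labels internally,
    # so essentially the same next-token loss can also be obtained like this)
    if False:
        out_alt = model(**batch, labels=batch['input_ids'])
        loss_alt = out_alt.loss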


print(loss.grad)
#None <- result

print(loss.is_leaf)
#False <- result

print(loss)
#tensor(6.6850, device='cuda:0', grad_fn=<NllLossBackward0>) <- result

print(list(model.parameters())[-1])
#Parameter containing: <- result
#tensor([-0.0454,  0.0815, -0.0442,  ..., -0.0566, -0.0557, -0.0552],
#       device='cuda:0', dtype=torch.float16, requires_grad=True)


loss.backward()

print(loss.grad)
#None <- print result

print(loss.is_leaf)
#False <- print result

print(list(model.parameters())[-1])
#Parameter containing: <- result
#tensor([-0.0454,  0.0815, -0.0442,  ..., -0.0566, -0.0557, -0.0552],
#       device='cuda:0', dtype=torch.float16, requires_grad=True)

optimizer.step()

print(list(model.parameters())[-1])
#Parameter containing: <- result, this is the problem point.
#tensor([   nan, 0.0815,    nan,  ...,    nan,    nan,    nan], device='cuda:0',
#       dtype=torch.float16, requires_grad=True)
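
(For reference: a small check, using only standard PyTorch calls and not part of the run above, that could be placed between loss.backward() and optimizer.step() to see whether the fp16 gradients reaching AdamW are already non-finite.)

#sketch: inspect the gradients of the trainable parameters right before the optimizer step
for name, p in model.named_parameters():
    if p.requires_grad and p.grad is not None:
        n_bad = (~torch.isfinite(p.grad)).sum().item()
        print(name, p.grad.dtype, 'non-finite grad elements:', n_bad)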

  6. More Information
    What I actually want to do is downstream fine-tuning with LoRA.
    I first tried it with the code below.
#(... same as section 5.)

#load model
model = AutoModelForCausalLM.from_pretrained(
      'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16',  # or float32 version: revision=KoGPT6B-ryan1.5b
      pad_token_id=tokenizer.eos_token_id,
      torch_dtype='auto', low_cpu_mem_usage=True
    )

#my code for adding LoRA adapters (see https://github.com/huggingface/transformers/issues/14839)
def forward_linear_with_adapter(self, input: torch.Tensor) -> torch.Tensor:
    #replacement for nn.Linear's forward()
    out = F.linear(input, self.weight, self.bias)
    if self.lora_adapter:
        out += self.lora_adapter(input)
    return out

def AddLoRAtoLinear(layer, adapter_dim=16, _dtype=None):
    #add adapter
    dt = _dtype if _dtype else layer._parameters['weight'].dtype
    layer.lora_adapter = nn.Sequential(
        nn.Linear(layer.in_features, adapter_dim, bias=False, dtype=dt),
        nn.Linear(adapter_dim, layer.out_features, bias=False, dtype=dt)
    )
    #make trainable
    layer.lora_adapter.requires_grad_(True)
    nn.init.zeros_(layer.lora_adapter[1].weight)  #zero-init the second matrix so the adapter starts as a no-op (per the LoRA paper)
    
    #bind forward with adapter
    layer.forward = types.MethodType(forward_linear_with_adapter, layer)
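
#(optional illustration, not part of the original script) AddLoRAtoLinear can be tried on a
#small standalone float32 layer; because the adapter's second matrix is zero-initialized,
#the wrapped layer initially reproduces the plain nn.Linear output
_demo = nn.Linear(8, 8)
AddLoRAtoLinear(_demo, adapter_dim=4)
print(_demo(torch.randn(2, 8)).shape)  #torch.Size([2, 8])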
    
def forward_embedding_with_adapter(self, input: torch.Tensor) -> torch.Tensor:
    #replacement for nn.Embedding's forward()
    out = F.embedding(input, self.weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse)
    if self.lora_adapter:
        out += self.lora_adapter(input)
    return out

def AddLoRAtoEmbedding(layer, adapter_dim=16, _dtype=None):
    dt = _dtype if _dtype else layer._parameters['weight'].dtype
    #add adapter
    layer.lora_adapter = nn.Sequential(
        nn.Embedding(layer.num_embeddings, adapter_dim, dtype=dt),
        nn.Linear(adapter_dim, layer.embedding_dim, bias=False, dtype=dt)
    )
    #make trainable
    layer.lora_adapter.requires_grad_(True)
    nn.init.zeros_(layer.lora_adapter[1].weight)  #zero-init the second matrix, following the LoRA paper
    
    #bind forward with adapter
    layer.forward = types.MethodType(forward_embedding_with_adapter, layer)

def MakeLoRA(model):
    #freeze all existing parameters first; the adapters added below stay trainable
    for p in model.parameters():
        p.requires_grad_(False)
    
    #apply LoRA only to Embedding & Linear layers.
    needchange = []
    for module in model.modules():
        if type(module) == nn.Linear or type(module) == nn.Embedding:
            needchange.append(module)

    for module in needchange:
        if type(module) == nn.Linear:
            #run custom LoRA attach function to this layer
            AddLoRAtoLinear(module)
        elif type(module) == nn.Embedding:
            #run custom LoRA attach function to this layer
            AddLoRAtoEmbedding(module)
    return model



if False:
  #instead of doing this (the freezing code from section 5), call MakeLoRA()
  ls = list(model.modules())
  for i, m in enumerate(ls):
      if i == len(ls) - 1:
          #unfreeze for last layer
          m.requires_grad_(True)
      else:
          #freeze other layers
          m.requires_grad_(False)
else:
  #add LoRA adapters to the model
  model = MakeLoRA(model)


#(then run the same training steps as in section 5... and get the same NaN error)
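
#(aside, a sketch only, not part of the original run) quick check of what MakeLoRA attached,
#plus an optimizer built over just the trainable adapter weights
n_adapters = sum(1 for m in model.modules() if hasattr(m, 'lora_adapter'))
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'modules with a LoRA adapter: {n_adapters}, trainable parameters: {n_trainable}')
optimizer = optim.AdamW((p for p in model.parameters() if p.requires_grad), lr=1e-5)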