Large Language Model Runs out of memory no matter which Hyperparameters I change - GeForce GTX 3060Ti

Question

I have been trying to fix this for a few days. The problem is that it runs out of memory because my training data is quite large. I have a system implemented to take pieces of the code by manually entering the location of the training data chunks. The problem is, no matter what changes are made to the hyperparameters it won't process because the GPU runs out of memory.

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import re
import sys
import os

# hyperparameters
batch_size = 8  # Increase batch size to utilize more GPU memory
block_size = 128  # Increase block size to capture longer dependencies
max_iters = 1000  # Increase the number of training iterations
eval_interval = 100  # Evaluate the loss every 100 iterations
learning_rate = 5e-4  # Slightly lower learning rate for larger batch size
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 20  # Increase the embedding dimension for more model capacity
n_head = 4  # Increase the number of attention heads
n_layer = 20  # Increase the number of layers for a deeper model
dropout = 0.0

torch.manual_seed(1337)
# Initialize the model

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('F:/LLM Homebrew/UTF8_webtext/urlsf_subset00-1_data', 'r', encoding='utf-8') as f:
    text = f.read()

# Split the text into words
words = re.findall(r'\w+|[^\w\s]', text)

# Here are all the unique words that occur in this text
vocab = sorted(list(set(words)))
vocab_size = len(vocab)

# Create a mapping from words to integers
stoi = {word: i for i, word in enumerate(vocab)}
itos = {i: word for i, word in enumerate(vocab)}
encode = lambda s: [stoi[word] for word in s]  # encoder: take a list of words, output a list of integers
decode = lambda l: ' '.join([itos[i] for i in l])  # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(words), dtype=torch.long)
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

class CustomDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        return self.data[idx:idx + self.block_size]

# data loading
custom_dataset = CustomDataset(train_data, block_size)
data_loader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)  # (B,T,C)
        q = self.query(x)  # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2, -1) * C ** -0.5  # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B,T,C)
        out = wei @ v  # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

# Save and load model functions
def save_model(model, path):
    torch.save(model.state_dict(), path)

def load_model(model, path):
    model.load_state_dict(torch.load(path))
    model.eval()

model = BigramLanguageModel()
model = model.to(device)

def main(file_path, additional_iters):
    # Read the text from the provided file
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Check if the pre-trained model exists
    pre_trained_model_path = 'bigram_model.pt'
    if os.path.exists(pre_trained_model_path):
        # Load the pre-trained model
        load_model(model, pre_trained_model_path)
        print("Pre-trained model loaded.")
    else:
        print("Training the model...")
        # create a PyTorch optimizer
        # Training Loop with Gradient Accumulation
        accumulation_steps = 8
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

        for iter in range(max_iters):
            # every once in a while evaluate the loss on train and val sets
            if iter % eval_interval == 0 or iter == max_iters - 1:
                losses = estimate_loss()
                print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

            total_loss = 0.0
            for i, xb in enumerate(data_loader):
                xb = xb.to(device)
                yb = torch.cat((xb[:, 1:], xb[:, 0].unsqueeze(1)), dim=1)  # Shift by one for autoregressive prediction

                # evaluate the loss
                logits, loss = model(xb, yb)
                loss /= accumulation_steps
                total_loss += loss.item()

                if (i + 1) % accumulation_steps == 0:
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad(set_to_none=True)

            total_loss /= len(data_loader)
            print(f"Iteration {iter}: average training loss {total_loss:.4f}")

            # Save the trained model after each training iteration
            save_model(model, pre_trained_model_path)
            print("Model saved.")

    # Additional training iterations
    if additional_iters > 0:
        print(f"Continuing training for {additional_iters} additional iterations...")
        for iter in range(additional_iters):
            total_loss = 0.0
            for i, xb in enumerate(data_loader):
                xb = xb.to(device)
                yb = torch.cat((xb[:, 1:], xb[:, 0].unsqueeze(1)), dim=1)  # Shift by one for autoregressive prediction

                # evaluate the loss
                logits, loss = model(xb, yb)
                loss /= accumulation_steps
                total_loss += loss.item()

                if (i + 1) % accumulation_steps == 0:
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad(set_to_none=True)

            total_loss /= len(data_loader)
            print(f"Additional Iteration {iter + 1}: average training loss {total_loss:.4f}")

            # Save the trained model after each additional training iteration
            save_model(model, pre_trained_model_path)
            print("Model saved.")

        print(f"Additional {additional_iters} iterations completed.")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python llm4.py <file_path> <additional_iters>")
    else:
        file_path = sys.argv[1]
        additional_iters = int(sys.argv[2])
        main(file_path, additional_iters)

Upon further experimentation, the problem is not the size of the training data. I made a very small sample and the GPU still runs out of memory. — Auto, Jul 29 '23 at 23:37

score 1 · Answer 1 · answered Jul 30 '23 at 00:00

1

I figured out the problem. The issue was that the file is used on the command line and I input the incorrect filename for the version I was using. It was repeatedly using a model that doesn't fit in the GPU.

answered Jul 30 '23 at 00:00

Auto

59
1
1
6

Glad you solved the problem! You may want to close your question as "not reproducible, or caused by a typo" if the issue was due to using the wrong file name somewhere outside of the code you included. – Quack E. Duck Aug 03 '23 at 03:57

Large Language Model Runs out of memory no matter which Hyperparameters I change - GeForce GTX 3060Ti

1 Answers1