I have been trying to fix this for a few days. The script runs out of memory because my training data is quite large, so I set up a system to train on one piece of the data at a time by manually passing in the path of each training-data chunk. The problem is that no matter how I change the hyperparameters, training never gets going because the GPU runs out of memory.
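For reference, here is a small diagnostic helper I can drop in to print how much GPU memory is held at any point (a rough sketch, not part of the script below; report_gpu_memory is just a name I made up):

import torch

def report_gpu_memory(tag=""):
    # print currently allocated / reserved CUDA memory in MiB
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 2**20
        reserved = torch.cuda.memory_reserved() / 2**20
        print(f"[{tag}] allocated {allocated:.1f} MiB, reserved {reserved:.1f} MiB")

The full script is below.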
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import re
import sys
import os
# hyperparameters
batch_size = 8 # Increase batch size to utilize more GPU memory
block_size = 128 # Increase block size to capture longer dependencies
max_iters = 1000 # Increase the number of training iterations
eval_interval = 100 # Evaluate the loss every 100 iterations
learning_rate = 5e-4 # Slightly lower learning rate for larger batch size
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 20 # Increase the embedding dimension for more model capacity
n_head = 4 # Increase the number of attention heads
n_layer = 20 # Increase the number of layers for a deeper model
dropout = 0.0
torch.manual_seed(1337)
# Load one chunk of the training data
# (the tutorial's original source was https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt)
with open('F:/LLM Homebrew/UTF8_webtext/urlsf_subset00-1_data', 'r', encoding='utf-8') as f:
text = f.read()
# Split the text into words
words = re.findall(r'\w+|[^\w\s]', text)
# Here are all the unique words that occur in this text
vocab = sorted(list(set(words)))
vocab_size = len(vocab)
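# The word-level vocabulary of a large webtext chunk can get very big, and
# vocab_size sets the width of the token embedding table, the lm_head, and the
# softmax inside F.cross_entropy, so it is worth printing it up front.
print(f"vocab_size: {vocab_size:,}")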
# Create a mapping from words to integers
stoi = {word: i for i, word in enumerate(vocab)}
itos = {i: word for i, word in enumerate(vocab)}
encode = lambda s: [stoi[word] for word in s] # encoder: take a list of words, output a list of integers
decode = lambda l: ' '.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
# Train and test splits
data = torch.tensor(encode(words), dtype=torch.long)
n = int(0.9 * len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]
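# Note: `data`, `train_data`, and `val_data` live in CPU RAM; only the small
# (batch_size, block_size) batches built below are moved to the GPU.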
class CustomDataset(Dataset):
def __init__(self, data, block_size):
self.data = data
self.block_size = block_size
def __len__(self):
return len(self.data) - self.block_size
    def __getitem__(self, idx):
        # return an input window and its next-token targets (shifted one position)
        return self.data[idx:idx + self.block_size], self.data[idx + 1:idx + self.block_size + 1]
# dataset and DataLoader over the training split
custom_dataset = CustomDataset(train_data, block_size)
data_loader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)
# random batch sampling, used by estimate_loss() below
def get_batch(split):
# generate a small batch of data of inputs x and targets y
data = train_data if split == 'train' else val_data
ix = torch.randint(len(data) - block_size, (batch_size,))
x = torch.stack([data[i:i + block_size] for i in ix])
y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
x, y = x.to(device), y.to(device)
return x, y
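# Shapes for reference: x and y are both (batch_size, block_size) = (8, 128)
# LongTensors, with y being x shifted one token to the right.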
@torch.no_grad()
def estimate_loss():
out = {}
model.eval()
for split in ['train', 'val']:
losses = torch.zeros(eval_iters)
for k in range(eval_iters):
X, Y = get_batch(split)
logits, loss = model(X, Y)
losses[k] = loss.item()
out[split] = losses.mean()
model.train()
return out
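# estimate_loss() runs 2 * eval_iters = 400 forward passes per call, but under
# @torch.no_grad(), so no autograd graph is retained between them.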
class Head(nn.Module):
""" one head of self-attention """
def __init__(self, head_size):
super().__init__()
self.key = nn.Linear(n_embd, head_size, bias=False)
self.query = nn.Linear(n_embd, head_size, bias=False)
self.value = nn.Linear(n_embd, head_size, bias=False)
self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
self.dropout = nn.Dropout(dropout)
def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        # compute attention scores ("affinities"), scaled by 1/sqrt(head_size)
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B, T, head_size)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
return out
class MultiHeadAttention(nn.Module):
""" multiple heads of self-attention in parallel """
def __init__(self, num_heads, head_size):
super().__init__()
self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
self.proj = nn.Linear(n_embd, n_embd)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
out = torch.cat([h(x) for h in self.heads], dim=-1)
out = self.dropout(self.proj(out))
return out
class FeedForward(nn.Module):
""" a simple linear layer followed by a non-linearity """
def __init__(self, n_embd):
super().__init__()
self.net = nn.Sequential(
nn.Linear(n_embd, 4 * n_embd),
nn.ReLU(),
nn.Linear(4 * n_embd, n_embd),
nn.Dropout(dropout),
)
def forward(self, x):
return self.net(x)
class Block(nn.Module):
""" Transformer block: communication followed by computation """
def __init__(self, n_embd, n_head):
# n_embd: embedding dimension, n_head: the number of heads we'd like
super().__init__()
head_size = n_embd // n_head
self.sa = MultiHeadAttention(n_head, head_size)
self.ffwd = FeedForward(n_embd)
self.ln1 = nn.LayerNorm(n_embd)
self.ln2 = nn.LayerNorm(n_embd)
def forward(self, x):
x = x + self.sa(self.ln1(x))
x = x + self.ffwd(self.ln2(x))
return x
# decoder-only transformer language model (the class name is left over from the bigram tutorial it grew out of)
class BigramLanguageModel(nn.Module):
def __init__(self):
super().__init__()
# each token directly reads off the logits for the next token from a lookup table
self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
self.position_embedding_table = nn.Embedding(block_size, n_embd)
self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
self.ln_f = nn.LayerNorm(n_embd) # final layer norm
self.lm_head = nn.Linear(n_embd, vocab_size)
def forward(self, idx, targets=None):
B, T = idx.shape
# idx and targets are both (B,T) tensor of integers
tok_emb = self.token_embedding_table(idx) # (B,T,C)
pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
x = tok_emb + pos_emb # (B,T,C)
x = self.blocks(x) # (B,T,C)
x = self.ln_f(x) # (B,T,C)
logits = self.lm_head(x) # (B,T,vocab_size)
if targets is None:
loss = None
else:
B, T, C = logits.shape
logits = logits.view(B * T, C)
targets = targets.view(B * T)
loss = F.cross_entropy(logits, targets)
return logits, loss
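    # Memory note: the reshaped logits above are (B*T, vocab_size) floats, i.e.
    # 8*128 = 1024 rows times the word-level vocab size; with a big vocabulary,
    # this tensor, its gradient, and the softmax inside F.cross_entropy can
    # easily be the largest allocations of a training step.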
def generate(self, idx, max_new_tokens):
# idx is (B, T) array of indices in the current context
for _ in range(max_new_tokens):
# crop idx to the last block_size tokens
idx_cond = idx[:, -block_size:]
# get the predictions
logits, loss = self(idx_cond)
# focus only on the last time step
logits = logits[:, -1, :] # becomes (B, C)
# apply softmax to get probabilities
probs = F.softmax(logits, dim=-1) # (B, C)
# sample from the distribution
idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
# append sampled index to the running sequence
idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
return idx
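# Example of how generate() is used once training is done (not called in this script):
#   context = torch.zeros((1, 1), dtype=torch.long, device=device)
#   print(decode(model.generate(context, max_new_tokens=50)[0].tolist()))
# Token id 0 is simply the first entry of the sorted word vocabulary.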
# Save and load model functions
def save_model(model, path):
torch.save(model.state_dict(), path)
def load_model(model, path):
model.load_state_dict(torch.load(path))
model.eval()
model = BigramLanguageModel()
model = model.to(device)
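# Quick size check: with n_embd = 20 the transformer blocks themselves are tiny,
# so when the word-level vocabulary is large, almost all of the parameters sit
# in token_embedding_table and lm_head.
print(sum(p.numel() for p in model.parameters()) / 1e6, 'M parameters')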
def main(file_path, additional_iters):
    # Read the text from the provided chunk file
    # NOTE: this `text` is not used further down; the data the model actually
    # trains on is the chunk read at module level near the top of the script.
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # The optimizer and gradient-accumulation settings are needed both when
    # training from scratch and for the additional iterations after loading a
    # checkpoint, so create them before the branch below.
    accumulation_steps = 8
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Check if the pre-trained model exists
    pre_trained_model_path = 'bigram_model.pt'
    if os.path.exists(pre_trained_model_path):
        # Load the pre-trained model
        load_model(model, pre_trained_model_path)
        print("Pre-trained model loaded.")
    else:
        print("Training the model...")
        # Training loop with gradient accumulation
for iter in range(max_iters):
# every once in a while evaluate the loss on train and val sets
if iter % eval_interval == 0 or iter == max_iters - 1:
losses = estimate_loss()
print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
            total_loss = 0.0
            optimizer.zero_grad(set_to_none=True)
            for i, (xb, yb) in enumerate(data_loader):
                # inputs and next-token targets come pre-shifted from CustomDataset
                xb, yb = xb.to(device), yb.to(device)
                # evaluate the loss
                logits, loss = model(xb, yb)
                total_loss += loss.item()
                # scale the loss so gradients average over the accumulation window
                loss = loss / accumulation_steps
                # accumulate gradients every batch, step only every accumulation_steps batches
                loss.backward()
                if (i + 1) % accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad(set_to_none=True)
total_loss /= len(data_loader)
print(f"Iteration {iter}: average training loss {total_loss:.4f}")
# Save the trained model after each training iteration
save_model(model, pre_trained_model_path)
print("Model saved.")
# Additional training iterations
if additional_iters > 0:
print(f"Continuing training for {additional_iters} additional iterations...")
        model.train()  # load_model() leaves the model in eval mode
        for iter in range(additional_iters):
            total_loss = 0.0
            optimizer.zero_grad(set_to_none=True)
            for i, (xb, yb) in enumerate(data_loader):
                # inputs and next-token targets come pre-shifted from CustomDataset
                xb, yb = xb.to(device), yb.to(device)
                # evaluate the loss
                logits, loss = model(xb, yb)
                total_loss += loss.item()
                # scale the loss so gradients average over the accumulation window
                loss = loss / accumulation_steps
                loss.backward()
                if (i + 1) % accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad(set_to_none=True)
total_loss /= len(data_loader)
print(f"Additional Iteration {iter + 1}: average training loss {total_loss:.4f}")
# Save the trained model after each additional training iteration
save_model(model, pre_trained_model_path)
print("Model saved.")
print(f"Additional {additional_iters} iterations completed.")
if __name__ == "__main__":
if len(sys.argv) != 3:
print("Usage: python llm4.py <file_path> <additional_iters>")
else:
file_path = sys.argv[1]
additional_iters = int(sys.argv[2])
main(file_path, additional_iters)
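For completeness, I run it one chunk at a time along these lines (the chunk path and the 100 extra iterations here are just examples):

python llm4.py "F:/LLM Homebrew/UTF8_webtext/urlsf_subset00-1_data" 100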