import time

import torch
import torch.optim as optim

# Hyperparameters
batch_size = 64
max_len = 256                 # context length (block size)
d_model = 384                 # embedding dimension
n_layer = 6
n_head = 6
d_q = d_model // n_head       # per-head dimension (384 / 6 = 64)
dropout = 0.2
vocab_size = 65               # placeholder; recomputed from the data below
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
eval_iters = 200

""" ---- Device ---- """
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using CUDA (GPU)")
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device('mps')
    print("Using MPS (Apple Silicon GPU)")
else:
    device = torch.device('cpu')
    print("Using CPU")

""" ---- Data ---- """
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))   # all unique characters in the text
vocab_size = len(chars)           # 65 distinct characters for this dataset

stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = {i: ch for i, ch in enumerate(chars)}  # integer id -> character

def encoder(s):
    """Take a string and return the list of its characters' integer indices."""
    return [stoi[c] for c in s]

def decoder(l):
    """Take a list of integer indices and return the decoded string."""
    return ''.join(itos[i] for i in l)

data = torch.tensor(encoder(text), dtype=torch.long)  # one id per character, so len(data) == len(text)
n = int(0.9 * len(data))
train_data = data[:n]  # first 90% of the text
val_data = data[n:]    # remaining 10% of the text

def get_batch(split):
    """Sample a random batch of (input, target) sequences from the chosen split."""
    if split.lower() == 'train':
        data = train_data
    else:
        data = val_data
    ix = torch.randint(len(data) - max_len, (batch_size,))       # batch_size=64 random start offsets in [0, len(data) - max_len)
    x = torch.stack([data[i:i + max_len] for i in ix])           # each row is a max_len=256-token slice, so x has shape [64, 256]
    y = torch.stack([data[i + 1:i + max_len + 1] for i in ix])   # x shifted right by one; the targets compared against the model's predictions via cross-entropy
    return x.to(device), y.to(device)

""" --- Model Training --- """
if __name__ == "__main__":
    from model import Model

    model = Model()
    m = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

    @torch.no_grad()
    def estimate_loss():
        """Average the loss over eval_iters batches for each split, with gradients disabled."""
        out = {}
        model.eval()
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                logits, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
        model.train()
        return out

    for iter in range(max_iters):
        if iter % eval_interval == 0 or iter == max_iters - 1:
            losses = estimate_loss()
            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        iter_start = time.time()
        xb, yb = get_batch("train")
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)  # clear gradients from the previous step before the new batch
        loss.backward()                        # backpropagate: compute gradients of the loss w.r.t. the parameters
        optimizer.step()                       # update parameters along the gradient; the learning rate sets the step size
        iter_time = time.time() - iter_start
        print(f"Iteration {iter} completed in {iter_time:.2f} seconds")

    print("Training finished. Saving model state...")
    torch.save(model.state_dict(), 'nanogpt_model.pth')
    print("Model saved to nanogpt_model.pth")
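
    # --- Optional: sampling sketch (assumption) ---
    # Minimal sketch, assuming model.py follows the usual nanoGPT convention and
    # Model exposes a generate(idx, max_new_tokens) method. That method name is an
    # assumption (model.py is not shown here), so the call is guarded with hasattr.
    context = torch.zeros((1, 1), dtype=torch.long, device=device)  # single start token (id 0)
    if hasattr(m, 'generate'):
        sample = m.generate(context, max_new_tokens=200)  # hypothetical API
        print(decoder(sample[0].tolist()))                # decode generated ids back to text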