# LLMFromScratch / train.py
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
batch_size = 64            # sequences processed in parallel per step
max_len = 256              # context length: characters per training sequence
d_model = 384              # embedding / hidden dimension
n_layer = 6                # number of transformer blocks
n_head = 6                 # attention heads per block
d_q = d_model // n_head    # per-head dimension: 384 // 6 = 64
dropout = 0.2
vocab_size = 65            # placeholder; recomputed below from the actual character set
max_iters = 5000           # total training iterations
eval_interval = 500        # evaluate train/val loss every 500 iterations
learning_rate = 3e-4
eval_iters = 200           # batches averaged per loss estimate
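# Quick sanity check on the head split (illustrative sketch, commented out so the
# script's behaviour is unchanged): the per-head dimension must tile d_model exactly.
# assert n_head * d_q == d_model  # 6 heads * 64 dims == 384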
"""
---- Device ----
"""
if torch.cuda.is_available():
device = torch.device('cuda')
print("Using CUDA (GPU)")
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
device = torch.device('mps')
print("Using MPS (Apple Silicon GPU)")
else:
device = torch.device('cpu')
print("Using device's CPU")
with open('input.txt', 'r', encoding='utf-8') as f:
text = f.read()
chars = sorted(list(set(text))) # --> All unique characters within the text
vocab_size = len(chars) # 65 different characters in text
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = {i: ch for i, ch in enumerate(chars)}  # integer id -> character
# Take a string and output a list of its characters' indices
def encoder(s):
res = []
for char in s:
res.append(stoi[char])
return res
# Take a list of indices and output a string
def decoder(l):
res = ""
for i in l:
res += itos[i]
return res
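# Illustrative round-trip (assumes the characters used appear in input.txt, which holds
# for ordinary English text); commented out so nothing extra runs at import time:
# ids = encoder("hi")          # e.g. [stoi['h'], stoi['i']]
# assert decoder(ids) == "hi"  # decoder inverts encoder for in-vocabulary characters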
data = torch.tensor(encoder(text), dtype=torch.long)  # 1-D tensor of length len(text): one integer id per character
n = int(0.9 * len(data))
train_data = data[:n] # 90% of text
val_data = data[n:] # 10% of text
def get_batch(split):
if split.lower() == 'train':
data = train_data
else:
data = val_data
    ix = torch.randint(len(data) - max_len, (batch_size,))  # batch_size=64 random start offsets in [0, len(data) - max_len)
    x = torch.stack([data[i:i+max_len] for i in ix])  # max_len=256 ids from each offset, stacked row-wise: shape [64, 256]
    y = torch.stack([data[i+1:i+max_len+1] for i in ix])  # targets shifted one position right, compared to the model's predictions via cross-entropy
return x.to(device), y.to(device)
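# Shape sketch (illustrative, commented out): each batch is a pair of (batch_size, max_len)
# tensors, with y shifted one character to the right of x, so position t of x is trained
# to predict position t of y.
# xb, yb = get_batch('train')
# assert xb.shape == yb.shape == (batch_size, max_len)   # (64, 256)
# assert (xb[0, 1:] == yb[0, :-1]).all()                 # targets are inputs shifted by one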
"""
--- Model Training ---
"""
if __name__ == "__main__":
from model import Model
model = Model()
m = model.to(device)
optimizer = optim.AdamW(
model.parameters(),
lr=learning_rate
)
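    # Only the learning rate is set explicitly; betas, eps, and weight decay stay at
    # PyTorch's AdamW defaults. An equivalent explicit call would look roughly like
    # (illustrative, using the library's documented defaults):
    # optimizer = optim.AdamW(model.parameters(), lr=learning_rate,
    #                         betas=(0.9, 0.999), eps=1e-8, weight_decay=0.01)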
    @torch.no_grad()  # no gradients needed while estimating loss
def estimate_loss():
out = {}
model.eval()
for split in ['train', 'val']:
losses = torch.zeros(eval_iters)
for k in range(eval_iters):
X, Y = get_batch(split)
logits, loss = model(X, Y)
losses[k] = loss.item()
out[split] = losses.mean()
model.train()
return out
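    # Rough yardstick (assuming the untrained model starts near a uniform distribution
    # over the vocabulary): the initial cross-entropy should be about
    # -ln(1/vocab_size) = ln(65) ≈ 4.17, and training should pull both losses well below that.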
for iter in range(max_iters):
if iter % eval_interval == 0 or iter == max_iters - 1:
losses = estimate_loss()
print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
iter_start = time.time()
xb, yb = get_batch("train")
logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)  # Clear the previous iteration's gradients before the new backward pass
        loss.backward()  # Backward pass: compute the gradient of the loss w.r.t. every model parameter
        optimizer.step()  # Nudge the parameters along those gradients, scaled by the learning rate
iter_time = time.time() - iter_start
print(f"Iteration {iter} completed in {iter_time:.2f} seconds")
print("Training finished. Saving model state...")
torch.save(model.state_dict(), 'nanogpt_model.pth')
print("Model saved to nanogpt_model.pth")