Ashish Reddy committed
Commit · dde4f12
1 Parent(s): a6be05a
added all blocks of codes
Browse files
- .gitignore     +2  -0
- block.py       +38 -0
- feedForward.py +29 -0
- head.py        +44 -0
- input.txt      +0  -0
- multiHead.py   +31 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
*.DS_Store
__pycache__/
block.py
ADDED
@@ -0,0 +1,38 @@
import torch, torch.nn as nn

batch_size = 64
max_len = 256
d_model = 384
n_head = 6
d_q = int(d_model / n_head)
dropout = 0.2

from multiHead import MultiHead
from feedForward import FeedForward

class Block(nn.Module):
    def __init__(self, d_model, n_head):
        super().__init__()
        self.multiHead = MultiHead(n_head, d_q)
        self.ffwd = FeedForward(d_model)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        normalized1 = self.ln1(x)        # Pre-LN: normalizing before each sub-layer tends to stabilize training better than post-LN
        multiHead = self.multiHead(normalized1)
        x = x + multiHead                # Residual addition

        normalized2 = self.ln2(x)
        ffwd = self.ffwd(normalized2)
        x = x + ffwd

        return x

if __name__ == "__main__":
    x = torch.randn(batch_size, max_len, d_model)
    block = Block(d_model, n_head)
    output = block(x)

    print("Input shape:", x.shape)
    print("Output shape from one Transformer Block:", output.shape)
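Since a Block maps (batch, max_len, d_model) back to the same shape, several of them can be chained directly. A minimal usage sketch of stacking them with nn.Sequential; the n_layer value and the stacking itself are assumptions for illustration, not part of this commit:

# Hypothetical usage sketch (not part of this commit): stacking several Blocks.
import torch, torch.nn as nn
from block import Block

n_layer = 6                                      # assumed depth, chosen only for illustration
blocks = nn.Sequential(*[Block(384, 6) for _ in range(n_layer)])

x = torch.randn(2, 256, 384)                     # (batch, max_len, d_model)
print(blocks(x).shape)                           # torch.Size([2, 256, 384]) -- shape is preserved through every Block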
feedForward.py
ADDED
@@ -0,0 +1,29 @@
import torch, torch.nn as nn

batch_size = 64
max_len = 256
d_model = 384
n_head = 6
d_q = int(d_model / n_head)
dropout = 0.2

class FeedForward(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(d_model, 4*d_model),   # Expand the dimension 4x so the ReLU that follows loses less information
            nn.ReLU(),
            nn.Linear(4*d_model, d_model),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.seq(x)

if __name__ == '__main__':
    x = torch.randn(batch_size, max_len, d_model)
    ffwd = FeedForward(d_model)
    output = ffwd(x)

    print("Input shape:", x.shape)
    print("Output shape from FeedForward network:", output.shape)
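With the values in this commit the inner expansion goes 384 -> 1536 -> 384. A quick sketch that inspects the intermediate width and parameter count of the module above; the indexing into self.seq is only for inspection:

# Sketch: inspecting FeedForward's 384 -> 1536 -> 384 expansion.
import torch
from feedForward import FeedForward

ffwd = FeedForward(384)
hidden = ffwd.seq[0](torch.randn(1, 256, 384))           # output of the first Linear only
print(hidden.shape)                                      # torch.Size([1, 256, 1536])
print(sum(p.numel() for p in ffwd.parameters()))         # 384*1536 + 1536 + 1536*384 + 384 = 1,181,568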
head.py
ADDED
@@ -0,0 +1,44 @@
import torch, torch.nn as nn, torch.nn.functional as F

batch_size = 64
max_len = 256
d_model = 384
n_head = 6
d_q = int(d_model / n_head)  # 384 / 6 = 64
dropout = 0.2

class Head(nn.Module):
    def __init__(self, d_q):
        super().__init__()
        self.query = nn.Linear(d_model, d_q, bias=False)  # Query weight matrix Wq: x of shape (seq, 384) times (384, 64) gives q of shape (seq, 64)
        self.key = nn.Linear(d_model, d_q, bias=False)    # k = x @ Wk
        self.value = nn.Linear(d_model, d_q, bias=False)  # v = x @ Wv

        self.register_buffer('tril', torch.tril(torch.ones(max_len, max_len)))  # Lower-triangular mask saved as a non-trainable buffer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, S, D = x.shape  # B --> batch; S --> sequence length; D --> model dimension

        q = self.query(x)  # Shape of q: (Batch, Seq_Len, d_q) = (B, S, 64)
        k = self.key(x)
        v = self.value(x)

        attention_matrix = torch.matmul(q, k.transpose(-2, -1))    # (B, S, 64) @ (B, 64, S) --> (B, S, S)
        attention_matrix = attention_matrix / (k.size(-1) ** 0.5)  # Scale by sqrt(d_q)

        attention_matrix = attention_matrix.masked_fill(self.tril[:S, :S] == 0, float('-inf'))  # The upper triangle of tril is all 0s, so every future position is filled with -inf; this makes the decoder causal rather than bidirectional

        attention_matrix = F.softmax(attention_matrix, dim=-1)  # dim=-1 applies softmax row-wise; the -inf entries become 0
        attention_matrix = self.dropout(attention_matrix)       # Apply 20% dropout to prevent overfitting
        output = torch.matmul(attention_matrix, v)               # (B, S, S) @ (B, S, 64) --> (B, S, 64); after concatenating the heads this restores the original x dimension, so the residual can simply be added

        return output

if __name__ == "__main__":
    x = torch.randn(batch_size, max_len, d_model)
    single_head = Head(d_q)
    output = single_head(x)

    print("Input shape:", x.shape)
    print("Output shape from a single head:", output.shape)
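The causal mask is easiest to see on a tiny example. A sketch with a 3-token sequence and made-up attention scores (values chosen only for illustration):

# Sketch: causal masking and row-wise softmax on a 3-token sequence.
import torch, torch.nn.functional as F

scores = torch.tensor([[0.5, 1.0, 2.0],
                       [0.1, 0.2, 0.3],
                       [1.0, 1.0, 1.0]])
tril = torch.tril(torch.ones(3, 3))
masked = scores.masked_fill(tril == 0, float('-inf'))    # future positions get -inf
weights = F.softmax(masked, dim=-1)                      # -inf entries become exactly 0
print(weights)
# Row 0 attends only to token 0, row 1 to tokens 0-1, row 2 to all three:
# tensor([[1.0000, 0.0000, 0.0000],
#         [0.4750, 0.5250, 0.0000],
#         [0.3333, 0.3333, 0.3333]])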
input.txt
ADDED
The diff for this file is too large to render. See raw diff.
multiHead.py
ADDED
@@ -0,0 +1,31 @@
import torch, torch.nn as nn

batch_size = 64
max_len = 256
d_model = 384
n_head = 6
d_q = int(d_model / n_head)
dropout = 0.2

from head import Head

class MultiHead(nn.Module):
    def __init__(self, n_head, d_q):
        super().__init__()
        self.heads = nn.ModuleList([Head(d_q) for _ in range(n_head)])  # A list of 6 heads, each with its own randomly initialized weights
        self.proj = nn.Linear(d_model, d_model)  # The 6 heads are concatenated to shape (B, S, 384); projecting with (384, 384) keeps (B, S, 384), ready to be added back via the residual connection
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        concatenated_outputs = torch.cat([head(x) for head in self.heads], dim=-1)  # Concatenate each (B, S, 64) head output along the last dimension to get (B, S, 384) with 6 heads
        output = self.proj(concatenated_outputs)
        output = self.dropout(output)
        return output

if __name__ == "__main__":
    x = torch.randn(batch_size, max_len, d_model)
    multi_head = MultiHead(n_head, d_q)
    output = multi_head(x)

    print("Input shape:", x.shape)
    print("Output shape from multi-head:", output.shape)
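The projection above relies on the head width being chosen so that n_head * d_q equals d_model. A small sketch of that bookkeeping with dummy stand-ins for the per-head outputs:

# Sketch: concatenating n_head outputs of width d_q restores d_model.
import torch

n_head, d_q = 6, 64                                            # values from this commit; d_model = 6 * 64 = 384
per_head = [torch.randn(2, 256, d_q) for _ in range(n_head)]   # stand-ins for the (B, S, d_q) head outputs
concat = torch.cat(per_head, dim=-1)
print(concat.shape)                                            # torch.Size([2, 256, 384]) -- back to (B, S, d_model)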