Ashish Reddy committed
Commit dde4f12 · 1 Parent(s): a6be05a

added all blocks of code

Files changed (6)
  1. .gitignore +2 -0
  2. block.py +38 -0
  3. feedForward.py +29 -0
  4. head.py +44 -0
  5. input.txt +0 -0
  6. multiHead.py +31 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ *.DS_Store
+ __pycache__/
block.py ADDED
@@ -0,0 +1,38 @@
+ import torch, torch.nn as nn
+
+ batch_size = 64
+ max_len = 256
+ d_model = 384
+ n_head = 6
+ d_q = int(d_model / n_head)
+ dropout = 0.2
+
+ from multiHead import MultiHead
+ from feedForward import FeedForward
+
+ class Block(nn.Module):
+     def __init__(self, d_model, n_head):
+         super().__init__()
+         self.multiHead = MultiHead(n_head, d_q)
+         self.ffwd = FeedForward(d_model)
+         self.ln1 = nn.LayerNorm(d_model)
+         self.ln2 = nn.LayerNorm(d_model)
+
+     def forward(self, x):
+         normalized1 = self.ln1(x) # Pre-LN: normalizing before each sub-layer tends to train more stably than post-LN
+         multiHead = self.multiHead(normalized1)
+         x = x + multiHead # Residual addition
+
+         normalized2 = self.ln2(x)
+         ffwd = self.ffwd(normalized2)
+         x = x + ffwd
+
+         return x
+
+ if __name__ == "__main__":
+     x = torch.randn(batch_size, max_len, d_model)
+     block = Block(d_model, n_head)
+     output = block(x)
+
+     print("Input shape:", x.shape)
+     print("Output shape from one Transformer Block:", output.shape)
feedForward.py ADDED
@@ -0,0 +1,29 @@
+ import torch, torch.nn as nn
+
+ batch_size = 64
+ max_len = 256
+ d_model = 384
+ n_head = 6
+ d_q = int(d_model / n_head)
+ dropout = 0.2
+
+ class FeedForward(nn.Module):
+     def __init__(self, d_model):
+         super().__init__()
+         self.seq = nn.Sequential(
+             nn.Linear(d_model, 4*d_model), # Expand to 4*d_model so less information is lost when ReLU zeroes out activations
+             nn.ReLU(),
+             nn.Linear(4*d_model, d_model),
+             nn.Dropout(dropout)
+         )
+
+     def forward(self, x):
+         return self.seq(x)
+
+ if __name__ == '__main__':
+     x = torch.randn(batch_size, max_len, d_model)
+     ffwd = FeedForward(d_model)
+     output = ffwd(x)
+
+     print("Input shape:", x.shape)
+     print("Output shape from FeedForward network:", output.shape)
head.py ADDED
@@ -0,0 +1,44 @@
+ import torch, torch.nn as nn, torch.nn.functional as F
+
+ batch_size = 64
+ max_len = 256
+ d_model = 384
+ n_head = 6
+ d_q = int(d_model / n_head) # 384/6 = 64
+ dropout = 0.2
+
+ class Head(nn.Module):
+     def __init__(self, d_q):
+         super().__init__()
+         self.query = nn.Linear(d_model, d_q, bias=False) # Query weight matrix Wq: x of shape (seq, 384) times (384, 64) gives q of shape (seq, 64)
+         self.key = nn.Linear(d_model, d_q, bias=False) # k = x @ Wk
+         self.value = nn.Linear(d_model, d_q, bias=False) # v = x @ Wv
+
+         self.register_buffer('tril', torch.tril(torch.ones(max_len, max_len))) # Stored via register_buffer: part of the module's state but not a trainable parameter
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x):
+         B, S, D = x.shape # B --> batch; S --> sequence length; D --> embedding dimension
+
+         q = self.query(x) # Shape of q: (Batch, Seq_Len, d_q) = (B, S, 64)
+         k = self.key(x)
+         v = self.value(x)
+
+         attention_matrix = torch.matmul(q, k.transpose(-2, -1)) # (B, S, 64) @ (B, 64, S) --> (B, S, S)
+         attention_matrix = attention_matrix / (k.size(-1) ** 0.5)
+
+         attention_matrix = attention_matrix.masked_fill(self.tril[:S, :S] == 0, float('-inf')) # Fill positions where tril == 0 (the upper triangle, i.e. future tokens) with -inf so softmax gives them zero weight, making the decoder causal rather than bidirectional
+
+         attention_matrix = F.softmax(attention_matrix, dim=-1) # dim=-1 applies softmax row-wise
+         attention_matrix = self.dropout(attention_matrix) # 20% dropout on the attention weights to prevent overfitting
+         output = torch.matmul(attention_matrix, v) # (B, S, S) @ (B, S, 64) --> (B, S, 64); concatenating all heads later restores d_model, so the residual addition lines up
+
+         return output
+
+ if __name__ == "__main__":
+     x = torch.randn(batch_size, max_len, d_model)
+     single_head = Head(d_q)
+     output = single_head(x)
+
+     print("Input shape:", x.shape)
+     print("Output shape from a single head:", output.shape)
input.txt ADDED
The diff for this file is too large to render. See raw diff
 
multiHead.py ADDED
@@ -0,0 +1,31 @@
+ import torch, torch.nn as nn
+
+ batch_size = 64
+ max_len = 256
+ d_model = 384
+ n_head = 6
+ d_q = int(d_model / n_head)
+ dropout = 0.2
+
+ from head import Head
+
+ class MultiHead(nn.Module):
+     def __init__(self, n_head, d_q):
+         super().__init__()
+         self.heads = nn.ModuleList([Head(d_q) for _ in range(n_head)]) # 6 heads, each with its own randomly initialized weights
+         self.proj = nn.Linear(d_model, d_model) # Project the concatenated heads: (B, S, 384) @ (384, 384) --> (B, S, 384), ready for the residual connection
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x):
+         concatenated_outputs = torch.cat([head(x) for head in self.heads], dim=-1) # Concatenate each (B, S, 64) head output along the last dimension to get (B, S, 384) for 6 heads
+         output = self.proj(concatenated_outputs)
+         output = self.dropout(output)
+         return output
+
+ if __name__ == "__main__":
+     x = torch.randn(batch_size, max_len, d_model)
+     multi_head = MultiHead(n_head, d_q)
+     output = multi_head(x)
+
+     print("Input shape:", x.shape)
+     print("Output shape from multi-head:", output.shape)