Ashish Reddy committed
Commit · dde4f12
1 Parent(s): a6be05a
added all blocks of codes
Browse files
- .gitignore     +2  -0
- block.py       +38 -0
- feedForward.py +29 -0
- head.py        +44 -0
- input.txt      +0  -0
- multiHead.py   +31 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
*.DS_Store
__pycache__/
block.py
ADDED
@@ -0,0 +1,38 @@
import torch, torch.nn as nn

batch_size = 64
max_len = 256
d_model = 384
n_head = 6
d_q = int(d_model / n_head)
dropout = 0.2

from multiHead import MultiHead
from feedForward import FeedForward

class Block(nn.Module):
    def __init__(self, d_model, n_head):
        super().__init__()
        self.multiHead = MultiHead(n_head, d_q)
        self.ffwd = FeedForward(d_model)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        normalized1 = self.ln1(x)        # Pre-LN: normalizing before each sub-layer tends to stabilize training better than post-LN
        multiHead = self.multiHead(normalized1)
        x = x + multiHead                # Residual addition

        normalized2 = self.ln2(x)
        ffwd = self.ffwd(normalized2)
        x = x + ffwd

        return x

if __name__ == "__main__":
    x = torch.randn(batch_size, max_len, d_model)
    block = Block(d_model, n_head)
    output = block(x)

    print("Input shape:", x.shape)
    print("Output shape from one Transformer Block:", output.shape)
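Since a Block maps (batch, max_len, d_model) back to the same shape, several of them can be chained directly. A minimal usage sketch of stacking them with nn.Sequential; the n_layer value and the stacking itself are assumptions for illustration, not part of this commit:

# Hypothetical usage sketch (not part of this commit): stacking several Blocks.
import torch, torch.nn as nn
from block import Block

n_layer = 6                                      # assumed depth, chosen only for illustration
blocks = nn.Sequential(*[Block(384, 6) for _ in range(n_layer)])

x = torch.randn(2, 256, 384)                     # (batch, max_len, d_model)
print(blocks(x).shape)                           # torch.Size([2, 256, 384]) -- shape is preserved through every Block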
feedForward.py
ADDED
@@ -0,0 +1,29 @@
import torch, torch.nn as nn

batch_size = 64
max_len = 256
d_model = 384
n_head = 6
d_q = int(d_model / n_head)
dropout = 0.2

class FeedForward(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(d_model, 4*d_model),   # Expand the dimension 4x so the ReLU that follows loses less information
            nn.ReLU(),
            nn.Linear(4*d_model, d_model),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.seq(x)

if __name__ == '__main__':
    x = torch.randn(batch_size, max_len, d_model)
    ffwd = FeedForward(d_model)
    output = ffwd(x)

    print("Input shape:", x.shape)
    print("Output shape from FeedForward network:", output.shape)
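With the values in this commit the inner expansion goes 384 -> 1536 -> 384. A quick sketch that inspects the intermediate width and parameter count of the module above; the indexing into self.seq is only for inspection:

# Sketch: inspecting FeedForward's 384 -> 1536 -> 384 expansion.
import torch
from feedForward import FeedForward

ffwd = FeedForward(384)
hidden = ffwd.seq[0](torch.randn(1, 256, 384))           # output of the first Linear only
print(hidden.shape)                                      # torch.Size([1, 256, 1536])
print(sum(p.numel() for p in ffwd.parameters()))         # 384*1536 + 1536 + 1536*384 + 384 = 1,181,568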
head.py
ADDED
@@ -0,0 +1,44 @@
import torch, torch.nn as nn, torch.nn.functional as F

batch_size = 64
max_len = 256
d_model = 384
n_head = 6
d_q = int(d_model / n_head)  # 384 / 6 = 64
dropout = 0.2

class Head(nn.Module):
    def __init__(self, d_q):
        super().__init__()
        self.query = nn.Linear(d_model, d_q, bias=False)  # Query weight matrix Wq: x of shape (seq, 384) times (384, 64) gives q of shape (seq, 64)
        self.key = nn.Linear(d_model, d_q, bias=False)    # k = x @ Wk
        self.value = nn.Linear(d_model, d_q, bias=False)  # v = x @ Wv

        self.register_buffer('tril', torch.tril(torch.ones(max_len, max_len)))  # Lower-triangular mask saved as a non-trainable buffer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, S, D = x.shape  # B --> batch; S --> sequence length; D --> model dimension

        q = self.query(x)  # Shape of q: (Batch, Seq_Len, d_q) = (B, S, 64)
        k = self.key(x)
        v = self.value(x)

        attention_matrix = torch.matmul(q, k.transpose(-2, -1))    # (B, S, 64) @ (B, 64, S) --> (B, S, S)
        attention_matrix = attention_matrix / (k.size(-1) ** 0.5)  # Scale by sqrt(d_q)

        attention_matrix = attention_matrix.masked_fill(self.tril[:S, :S] == 0, float('-inf'))  # The upper triangle of tril is all 0s, so every future position is filled with -inf; this makes the decoder causal rather than bidirectional

        attention_matrix = F.softmax(attention_matrix, dim=-1)  # dim=-1 applies softmax row-wise; the -inf entries become 0
        attention_matrix = self.dropout(attention_matrix)       # Apply 20% dropout to prevent overfitting
        output = torch.matmul(attention_matrix, v)               # (B, S, S) @ (B, S, 64) --> (B, S, 64); after concatenating the heads this restores the original x dimension, so the residual can simply be added

        return output

if __name__ == "__main__":
    x = torch.randn(batch_size, max_len, d_model)
    single_head = Head(d_q)
    output = single_head(x)

    print("Input shape:", x.shape)
    print("Output shape from a single head:", output.shape)
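The causal mask is easiest to see on a tiny example. A sketch with a 3-token sequence and made-up attention scores (values chosen only for illustration):

# Sketch: causal masking and row-wise softmax on a 3-token sequence.
import torch, torch.nn.functional as F

scores = torch.tensor([[0.5, 1.0, 2.0],
                       [0.1, 0.2, 0.3],
                       [1.0, 1.0, 1.0]])
tril = torch.tril(torch.ones(3, 3))
masked = scores.masked_fill(tril == 0, float('-inf'))    # future positions get -inf
weights = F.softmax(masked, dim=-1)                      # -inf entries become exactly 0
print(weights)
# Row 0 attends only to token 0, row 1 to tokens 0-1, row 2 to all three:
# tensor([[1.0000, 0.0000, 0.0000],
#         [0.4750, 0.5250, 0.0000],
#         [0.3333, 0.3333, 0.3333]])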
input.txt
ADDED
The diff for this file is too large to render. See raw diff.
multiHead.py
ADDED
@@ -0,0 +1,31 @@
import torch, torch.nn as nn

batch_size = 64
max_len = 256
d_model = 384
n_head = 6
d_q = int(d_model / n_head)
dropout = 0.2

from head import Head

class MultiHead(nn.Module):
    def __init__(self, n_head, d_q):
        super().__init__()
        self.heads = nn.ModuleList([Head(d_q) for _ in range(n_head)])  # A list of 6 heads, each with its own randomly initialized weights
        self.proj = nn.Linear(d_model, d_model)  # The 6 heads are concatenated to shape (B, S, 384); projecting with (384, 384) keeps (B, S, 384), ready to be added back via the residual connection
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        concatenated_outputs = torch.cat([head(x) for head in self.heads], dim=-1)  # Concatenate each (B, S, 64) head output along the last dimension to get (B, S, 384) with 6 heads
        output = self.proj(concatenated_outputs)
        output = self.dropout(output)
        return output

if __name__ == "__main__":
    x = torch.randn(batch_size, max_len, d_model)
    multi_head = MultiHead(n_head, d_q)
    output = multi_head(x)

    print("Input shape:", x.shape)
    print("Output shape from multi-head:", output.shape)
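The projection above relies on the head width being chosen so that n_head * d_q equals d_model. A small sketch of that bookkeeping with dummy stand-ins for the per-head outputs:

# Sketch: concatenating n_head outputs of width d_q restores d_model.
import torch

n_head, d_q = 6, 64                                            # values from this commit; d_model = 6 * 64 = 384
per_head = [torch.randn(2, 256, d_q) for _ in range(n_head)]   # stand-ins for the (B, S, d_q) head outputs
concat = torch.cat(per_head, dim=-1)
print(concat.shape)                                            # torch.Size([2, 256, 384]) -- back to (B, S, d_model)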