mariamkhmahran
committed on
Commit
·
2e5bb45
1 Parent(s):
fb1c644
upload model
Browse files
- components/activation.py +16 -0
- components/feed_forward.py +22 -0
- components/layer_norm.py +26 -0
- components/multi_head_attention.py +57 -0
- components/transformer_block.py +57 -0
- config.json +10 -0
- gpt_model.py +123 -0
- model_896_14_8_256.pth +3 -0
components/activation.py
ADDED
@@ -0,0 +1,16 @@
+import torch
+import torch.nn as nn
+
+class GELU(nn.Module):
+    '''
+    GELU (Gaussian Error Linear Unit) activation function (tanh approximation).
+    '''
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return 0.5 * x * (1 + torch.tanh(
+            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
+            (x + 0.044715 * torch.pow(x, 3))
+        ))
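For context, the formula above is the tanh approximation of GELU, so it should agree with PyTorch's built-in nn.GELU(approximate='tanh'). A minimal sketch of that check (standalone, assuming only PyTorch is installed):

import torch
import torch.nn as nn

# The hand-rolled tanh GELU above should match the built-in tanh approximation
# up to floating-point tolerance.
x = torch.randn(4, 8)
manual = 0.5 * x * (1 + torch.tanh(
    torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))
))
builtin = nn.GELU(approximate="tanh")(x)
print(torch.allclose(manual, builtin, atol=1e-6))  # expected: True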
components/feed_forward.py
ADDED
@@ -0,0 +1,22 @@
+import torch
+import torch.nn as nn
+
+from components.activation import GELU
+
+class FeedForward(nn.Module):
+    '''
+    Feed-forward neural network with GELU activation.
+    Expands the embedding dimension by a factor of 4, applies GELU, and projects back to emb_dim,
+    processing each token position independently after the attention sub-layer.
+    '''
+
+    def __init__(self, cfg):
+        super().__init__()
+        self.layers = nn.Sequential(
+            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
+            GELU(),
+            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
+        )
+
+    def forward(self, x):
+        return self.layers(x)
components/layer_norm.py
ADDED
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+class LayerNorm(nn.Module):
+    def __init__(self, emb_dim):
+        super().__init__()
+        self.eps = 1e-6  # small value to avoid division by zero
+        self.scale = nn.Parameter(torch.ones(emb_dim))   # trainable scale parameter
+        self.shift = nn.Parameter(torch.zeros(emb_dim))  # trainable shift parameter
+
+    def forward(self, x):
+        '''
+        In this implementation of Layer Normalization, the normalization is applied along
+        the last dimension of the input tensor x, which represents the embedding dimension (dim=-1).
+        Normalizing over the embedding dimension ensures that each token is treated independently,
+        preventing one token from influencing another.
+
+        For Transformer models, input data typically has the following shape:
+        [batch_size, seq_len, emb_dim]
+        '''
+
+        mean = x.mean(dim=-1, keepdim=True)
+        var = x.var(dim=-1, keepdim=True, unbiased=False)  # unbiased=False: population variance, no Bessel's correction
+        norm_x = (x - mean) / torch.sqrt(var + self.eps)
+
+        return self.scale * norm_x + self.shift
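As a sanity check, this implementation should agree with torch.nn.LayerNorm when given the same eps. A minimal sketch (the import path assumes the repo root is on sys.path):

import torch
import torch.nn as nn

from components.layer_norm import LayerNorm

x = torch.randn(2, 5, 896)             # [batch_size, seq_len, emb_dim]
custom = LayerNorm(896)                # scale=ones, shift=zeros at init
builtin = nn.LayerNorm(896, eps=1e-6)  # weight=ones, bias=zeros at init
print(torch.allclose(custom(x), builtin(x), atol=1e-5))  # expected: True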
components/multi_head_attention.py
ADDED
@@ -0,0 +1,57 @@
+import torch
+import torch.nn as nn
+
+class MultiHeadAttention(nn.Module):
+    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
+        super().__init__()
+        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
+
+        self.d_out = d_out
+        self.num_heads = num_heads
+        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim
+
+        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
+        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
+        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
+        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
+        self.dropout = nn.Dropout(dropout)
+        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+
+    def forward(self, x):
+        b, num_tokens, _ = x.shape
+
+        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
+        queries = self.W_query(x)
+        values = self.W_value(x)
+
+        # We implicitly split the matrix by adding a `num_heads` dimension
+        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
+        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
+        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
+        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
+
+        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
+        keys = keys.transpose(1, 2)
+        queries = queries.transpose(1, 2)
+        values = values.transpose(1, 2)
+
+        # Compute scaled dot-product attention (aka self-attention) with a causal mask
+        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
+
+        # Original mask truncated to the number of tokens and converted to boolean
+        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
+
+        # Use the mask to fill attention scores
+        attn_scores.masked_fill_(mask_bool, -torch.inf)
+
+        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
+        attn_weights = self.dropout(attn_weights)
+
+        # Shape: (b, num_tokens, num_heads, head_dim)
+        context_vec = (attn_weights @ values).transpose(1, 2)
+
+        # Combine heads, where self.d_out = self.num_heads * self.head_dim
+        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
+        context_vec = self.out_proj(context_vec)  # optional projection
+
+        return context_vec, attn_weights
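A minimal shape walk-through using the values from config.json (emb_dim=896 split across n_heads=14 gives head_dim=64); the import path assumes the repo root is on sys.path:

import torch

from components.multi_head_attention import MultiHeadAttention

attn = MultiHeadAttention(d_in=896, d_out=896, context_length=256,
                          dropout=0.2, num_heads=14, qkv_bias=True)
attn.eval()                   # disable dropout for a deterministic pass
x = torch.randn(2, 256, 896)  # [batch, num_tokens, emb_dim]
context, weights = attn(x)
print(context.shape)          # torch.Size([2, 256, 896])
print(weights.shape)          # torch.Size([2, 14, 256, 256]), one causal attention map per head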
components/transformer_block.py
ADDED
@@ -0,0 +1,57 @@
+import torch
+import torch.nn as nn
+
+from components.feed_forward import FeedForward
+from components.multi_head_attention import MultiHeadAttention
+
+class TransformerBlock(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.att = MultiHeadAttention(
+            d_in=cfg["emb_dim"],
+            d_out=cfg["emb_dim"],
+            context_length=cfg["context_length"],
+            num_heads=cfg["n_heads"],
+            dropout=cfg["drop_rate"],
+            qkv_bias=cfg["qkv_bias"])
+        self.ff = FeedForward(cfg)
+        self.norm1 = nn.LayerNorm(cfg["emb_dim"])
+        self.norm2 = nn.LayerNorm(cfg["emb_dim"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])
+
+
+    def forward(self, x):
+        '''
+        The transformer block consists of two main components:
+        - Multi-Head Self-Attention → Captures relationships between tokens.
+        - Feedforward Neural Network (FFN) → Processes each token independently after attention.
+
+        Both sub-layers use a pre-norm layout: the input is first normalized with LayerNorm,
+        passed through the sub-layer (attention or FFN), and the result is added back to the
+        unnormalized input through a skip connection.
+
+        Dropout is applied to each sub-layer's output before it is added to the skip
+        connection. This helps to prevent overfitting and improves generalization.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape [batch_size, seq_len, emb_dim].
+
+        Returns:
+            tuple: Output tensor of shape [batch_size, seq_len, emb_dim] and the attention weights.
+        '''
+
+        shortcut = x
+        x = self.norm1(x)
+        x, _attn_weights = self.att(x)
+        x = self.drop_shortcut(x)
+        x = x + shortcut
+
+        shortcut = x
+        x = self.norm2(x)
+        x = self.ff(x)
+        x = self.drop_shortcut(x)
+        x = x + shortcut
+
+        return x, _attn_weights
+
+
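A short usage sketch with a cfg dict mirroring config.json (hypothetical values copied from that file; the import path assumes the repo root is on sys.path). Note that the block returns a tuple of the output and the attention weights:

import torch

from components.transformer_block import TransformerBlock

cfg = {"emb_dim": 896, "context_length": 256, "n_heads": 14,
       "drop_rate": 0.2, "qkv_bias": True}
block = TransformerBlock(cfg)
block.eval()
x = torch.randn(2, 256, 896)
out, attn = block(x)
print(out.shape)   # torch.Size([2, 256, 896]) -- the residual connections preserve the shape
print(attn.shape)  # torch.Size([2, 14, 256, 256])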
config.json
ADDED
@@ -0,0 +1,10 @@
+{
+    "model_type": "gpt_and_prejudice",
+    "vocab_size": 50257,
+    "context_length": 256,
+    "emb_dim": 896,
+    "n_heads": 14,
+    "n_layers": 8,
+    "drop_rate": 0.2,
+    "qkv_bias": true
+}
gpt_model.py
ADDED
@@ -0,0 +1,123 @@
+
+import torch
+import torch.nn as nn
+from typing import Dict, Any, Optional
+
+from components.transformer_block import TransformerBlock
+from components.layer_norm import LayerNorm
+
+class InterventionPlan:
+    """
+    Hook object consulted during forward() to optionally replace activations.
+    Override any of the methods below in your experiment code.
+    """
+    def maybe_replace_resid_pre(self, layer_idx: int, x: torch.Tensor) -> torch.Tensor:
+        return x
+    def maybe_replace_resid_post(self, layer_idx: int, x: torch.Tensor) -> torch.Tensor:
+        return x
+    # Optional: only works if your blocks expose per-head z or mlp outputs.
+    def maybe_replace_head_z(self, layer_idx: int, z: torch.Tensor) -> torch.Tensor:
+        return z
+    def maybe_replace_mlp_out(self, layer_idx: int, h: torch.Tensor) -> torch.Tensor:
+        return h
+
+class GPTModel(nn.Module):
+    def __init__(self, cfg: Dict[str, Any]):
+        super().__init__()
+        self.cfg = cfg
+        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
+        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
+        self.drop_emb = nn.Dropout(cfg["drop_rate"])
+        self.trf_blocks = nn.Sequential(
+            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
+        )
+        self.final_norm = LayerNorm(cfg["emb_dim"])
+        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)
+
+    @torch.no_grad()
+    def cache_forward(self, in_idx: torch.Tensor):
+        """Run a forward pass with caching enabled (no interventions)."""
+        return self.forward(in_idx, enable_cache=True)
+
+    def forward(
+        self,
+        in_idx: torch.Tensor,
+        enable_cache: bool = False,
+        intervention_plan: Optional[InterventionPlan] = None,
+        output_hidden_states: bool = False,
+        output_attentions_weights: bool = False,
+        # Backward-compat args (ignored if a plan is provided)
+        intervene_layer: Optional[int] = None,
+        edited_hidden: Optional[torch.Tensor] = None,
+    ):
+        """
+        Mechanistic-interpretability-friendly forward.
+        Returns: logits, (optional) cache, (optional) hidden_states, (optional) attn_weights
+        Cache keys: resid_pre[L], resid_post[L], attn_weights[L]
+        """
+        B, T = in_idx.shape
+        device = in_idx.device
+
+        tok_embeds = self.tok_emb(in_idx)                           # [B, T, d]
+        pos_embeds = self.pos_emb(torch.arange(T, device=device))   # [T, d]
+        x = self.drop_emb(tok_embeds + pos_embeds)
+
+        cache: Dict[str, Dict[int, torch.Tensor]] = {}
+        if enable_cache:
+            cache = {"resid_pre": {}, "resid_post": {}, "attn_weights": {}}
+
+        hidden_states = []
+        attention_weights_per_layer = []
+
+        # Fall back to the legacy single-layer intervention if no plan is provided
+        legacy_layer = intervene_layer if (intervention_plan is None) else None
+        legacy_edit = edited_hidden if (intervention_plan is None) else None
+
+        for L, block in enumerate(self.trf_blocks):
+            if enable_cache:
+                cache["resid_pre"][L] = x.detach()
+
+            # Entry-point intervention
+            if intervention_plan is not None:
+                x = intervention_plan.maybe_replace_resid_pre(L, x)
+            elif legacy_layer is not None and legacy_edit is not None and L == legacy_layer:
+                x = legacy_edit  # Inject causal intervention
+
+            # Run block (assumed to return (x_out, attn_weights))
+            block_out = block(x)
+            if isinstance(block_out, tuple) and len(block_out) == 2:
+                x, attn_w = block_out
+            else:
+                x = block_out
+                attn_w = None
+
+            if output_attentions_weights and attn_w is not None:
+                attention_weights_per_layer.append(attn_w.detach())
+            if enable_cache and attn_w is not None:
+                cache["attn_weights"][L] = attn_w.detach()
+
+            # Exit-point intervention (rarely used, but handy)
+            if intervention_plan is not None:
+                x = intervention_plan.maybe_replace_resid_post(L, x)
+
+            if output_hidden_states:
+                hidden_states.append(x.clone())
+
+            if enable_cache:
+                cache["resid_post"][L] = x.detach()
+
+        x = self.final_norm(x)
+        logits = self.out_head(x)
+
+        outputs = (logits,)
+        if enable_cache:
+            outputs += (cache,)
+        if output_hidden_states:
+            outputs += (hidden_states,)
+        if output_attentions_weights:
+            outputs += (attention_weights_per_layer,)
+        return outputs if len(outputs) > 1 else outputs[0]
model_896_14_8_256.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2ff54930487bdf5583980796eeee05854d761d6926c5a568f8a31b9eaddcc27
+size 2011729853
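A minimal loading sketch tying the files together (assumptions: the repo root is the working directory, and model_896_14_8_256.pth stores a plain state_dict; if the checkpoint wraps the weights differently, adjust the torch.load handling accordingly):

import json
import torch

from gpt_model import GPTModel

with open("config.json") as f:
    cfg = json.load(f)

model = GPTModel(cfg)
state_dict = torch.load("model_896_14_8_256.pth", map_location="cpu")  # assumed to be a raw state_dict
model.load_state_dict(state_dict)
model.eval()

# Dummy token ids inside the context window; a real run would use a GPT-2 BPE tokenizer (vocab_size=50257).
in_idx = torch.randint(0, cfg["vocab_size"], (1, 16))
logits, cache = model.cache_forward(in_idx)  # forward pass with activation caching, no interventions
print(logits.shape)                          # torch.Size([1, 16, 50257])
print(sorted(cache.keys()))                  # ['attn_weights', 'resid_post', 'resid_pre']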