kokolamba committed 28 days ago

Commit

f640da0

1 Parent(s): f633391

Update model files

Browse files

Files changed (38) hide show

README.md +3 -0
checkpoint-2700/config.json +43 -0
checkpoint-2700/feedforward.py +196 -0
checkpoint-2700/gla.py +721 -0
checkpoint-2700/merges.txt +0 -0
checkpoint-2700/mla.py +619 -0
checkpoint-2700/optimizer.pt +3 -0
checkpoint-2700/pytorch_model.bin +3 -0
checkpoint-2700/rng_state.pth +3 -0
checkpoint-2700/scheduler.pt +3 -0
checkpoint-2700/shared_space_config.py +329 -0
checkpoint-2700/shared_space_decoder.py +386 -0
checkpoint-2700/special_tokens_map.json +6 -0
checkpoint-2700/task_heads.py +196 -0
checkpoint-2700/tokenizer.json +0 -0
checkpoint-2700/tokenizer_config.json +21 -0
checkpoint-2700/trainer_state.json +1060 -0
checkpoint-2700/training_args.bin +3 -0
checkpoint-2700/vocab.json +0 -0
checkpoint-3000/config.json +43 -0
checkpoint-3000/feedforward.py +196 -0
checkpoint-3000/gla.py +721 -0
checkpoint-3000/merges.txt +0 -0
checkpoint-3000/mla.py +619 -0
checkpoint-3000/optimizer.pt +3 -0
checkpoint-3000/pytorch_model.bin +3 -0
checkpoint-3000/rng_state.pth +3 -0
checkpoint-3000/scheduler.pt +3 -0
checkpoint-3000/shared_space_config.py +329 -0
checkpoint-3000/shared_space_decoder.py +386 -0
checkpoint-3000/special_tokens_map.json +6 -0
checkpoint-3000/task_heads.py +196 -0
checkpoint-3000/tokenizer.json +0 -0
checkpoint-3000/tokenizer_config.json +21 -0
checkpoint-3000/trainer_state.json +1174 -0
checkpoint-3000/training_args.bin +3 -0
checkpoint-3000/vocab.json +0 -0
full_config.json +73 -0

README.md ADDED Viewed

	@@ -0,0 +1,3 @@

+---
+license: mit
+---

checkpoint-2700/config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "architectures": [
+    "SharedSpaceDecoderForCausalLM"
+  ],
+  "attention_backend": "flash_attention_2",
+  "attention_bias": false,
+  "attention_dropout_prob": 0.1,
+  "bos_token_id": 50256,
+  "classifier_dropout": null,
+  "dtype": "float32",
+  "eos_token_id": 50256,
+  "ffn_decompose": false,
+  "ffn_rank": null,
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "kv_shared_dim": null,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 1024,
+  "model_type": "shared_subspace_decoder",
+  "nope_dims": 32,
+  "norm_type": "rmsnorm",
+  "num_attention_heads": 12,
+  "num_dense_layers": 0,
+  "num_hidden_layers": 12,
+  "o_shared_dim": null,
+  "pad_token_id": 50256,
+  "q_shared_dim": null,
+  "qk_private_dim": 64,
+  "rms_norm_eps": 1e-06,
+  "rope_dims": 32,
+  "rope_scaling": {
+    "factor": 2.0,
+    "type": "linear"
+  },
+  "rope_theta": 10000.0,
+  "transformers_version": "4.56.0",
+  "vo_private_dim": 64,
+  "vocab_rank": null,
+  "vocab_size": 50257,
+  "vocab_subspace": false
+}

checkpoint-2700/feedforward.py ADDED Viewed

	@@ -0,0 +1,196 @@

+"""# ▂▂▂▂▂▂▂▂▂▂▂▂
+# `feedforward.py`
+Regarding dropout:
+- I don't see it applied to the MoE in DeepSeek-V3, [here](https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py).
+- I don't see it applied in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L140)
+Norms:
+* nn.RMSNorm [here](https://docs.pytorch.org/docs/stable/generated/torch.nn.RMSNorm.html)
+## FFN
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .shared_space_config import SharedSpaceDecoderConfig
+def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
+    """
+    Create a normalization layer based on the config norm_type.
+    Args:
+        hidden_size: The dimension to normalize over
+        config: Configuration containing norm_type and epsilon values
+            (`layer_norm_eps` is read for "layernorm", `rms_norm_eps` for "rmsnorm")
+    Returns:
+        An nn.LayerNorm when norm_type is "layernorm", or a DeepseekV3RMSNorm
+        when norm_type is "rmsnorm"
+    Raises:
+        ValueError: If config.norm_type is neither "layernorm" nor "rmsnorm"
+    """
+    if config.norm_type == "layernorm":
+        return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+    elif config.norm_type == "rmsnorm":
+        return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
+    else:
+        # This should be caught by config validation, but being defensive
+        raise ValueError(f"Unknown norm_type: {config.norm_type}")
+# TODO - Find a shared place to put this.
+class DeepseekV3RMSNorm(nn.Module):
+    """Root-mean-square normalization over the last dimension with a learned scale."""
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        DeepseekV3RMSNorm is equivalent to T5LayerNorm
+        Args:
+            hidden_size: Size of the last dimension; the learned `weight` has this shape.
+            eps: Added to the mean square before taking the reciprocal square root.
+        """
+        super().__init__()
+        # Per-feature scale, initialized to ones.
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        """Divide `hidden_states` by its RMS over the last dim, then scale by `weight`."""
+        # Remember the caller's dtype; the statistics below are computed in float32.
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        # Mean of squares over the last dimension (no mean subtraction).
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        # Cast back to the input dtype before applying the learned scale.
+        return self.weight * hidden_states.to(input_dtype)
+class SubspaceFeedForward(nn.Module):
+    """
+    Feed-forward block for SharedSpaceDecoder.
+    Implements SwiGLU:
+        FFN(x) = W_out( Swish(W_in(x)) ⊙ W_gate(x) )
+    Supports both dense and decomposed MLP variants.
+    Dense:
+        - W_in:   Linear(hidden_dim → intermediate_dim)
+        - W_gate: Linear(hidden_dim → intermediate_dim)
+        - W_out:  Linear(intermediate_dim → hidden_dim)
+    Decomposed:
+        - W_in_shared:   Linear(hidden_dim → rank, bias=False)
+        - W_in_shared_norm: norm built by create_norm_layer (type follows config.norm_type)
+        - W_in:          Linear(rank → intermediate_dim)
+        - W_gate_shared: Linear(hidden_dim → rank, bias=False)
+        - W_gate_shared_norm: norm built by create_norm_layer (type follows config.norm_type)
+        - W_gate:        Linear(rank → intermediate_dim)
+        - W_out:         Linear(intermediate_dim → rank, bias=False)
+        - W_out_shared:  Linear(rank → hidden_dim)
+    This module returns only the FFN output. No residual connection, dropout, or
+    normalization of the output is applied here (the dropout lines are intentionally
+    commented out; see the dropout notes in the module docstring).
+    """
+    def __init__(self, config, layer_idx):
+        """
+        Build the dense or decomposed projections for one layer.
+        Args:
+            config: Provides ffn_decompose, num_dense_layers, hidden_size,
+                intermediate_size and, for the decomposed variant, ffn_rank and the
+                norm settings consumed by create_norm_layer.
+            layer_idx: Index of this layer; compared against config.num_dense_layers.
+        """
+        super().__init__()
+        #dropout_prob = config.hidden_dropout_prob # TODO - Style -- don't define variables if only used once.
+        # Determine whether this is a dense or decomposed layer.
+        # It's dense if either:
+        #  - ffn_decompose is disabled (every layer is a dense layer)
+        #  - ffn_decompose is enabled, but this is one of the early dense layers.
+        self.is_dense = (not config.ffn_decompose) or (layer_idx < config.num_dense_layers)
+        hidden_dim = config.hidden_size
+        intermediate_dim = config.intermediate_size # TODO - Find something shorter, and use the same name.
+        # If it's one of the dense layers,
+        if self.is_dense:
+            # === Dense FFN Projections ===
+            self.W_in = nn.Linear(hidden_dim, intermediate_dim)
+            self.W_gate = nn.Linear(hidden_dim, intermediate_dim)
+            self.W_out = nn.Linear(intermediate_dim, hidden_dim)
+        # Define weights for the decomposed version.
+        else:
+            # NOTE(review): assumes config.ffn_rank is an int whenever this branch
+            # is taken -- confirm against config validation.
+            rank = config.ffn_rank
+            print("hidden_dim:", hidden_dim)
+            print("rank:", rank)
+            # === Input Projections ===
+            self.W_in_shared = nn.Linear(hidden_dim, rank, bias=False)
+            self.W_in_shared_norm = create_norm_layer(rank, config)
+            self.W_in = nn.Linear(rank, intermediate_dim, bias=True)
+            # === Gate Projections ===
+            self.W_gate_shared = nn.Linear(hidden_dim, rank, bias=False)
+            self.W_gate_shared_norm = create_norm_layer(rank, config)
+            self.W_gate = nn.Linear(rank, intermediate_dim, bias=True)
+            # === Output Projection ===
+            self.W_out = nn.Linear(intermediate_dim, rank, bias=False)
+            # TODO - Could experiment with this.
+            #self.W_out_shared_layernorm = DeepseekV3RMSNorm(rank, eps=config.eps)
+            self.W_out_shared = nn.Linear(rank, hidden_dim, bias=True)
+        # No dropout is applied in this block (see the module docstring notes).
+        #self.dropout = nn.Dropout(config.hidden_dropout_prob)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the gated feed-forward transform and return its output.
+        Args:
+            x: Input whose last dimension is hidden_size (described below as [B, T, D]).
+        Returns:
+            Tensor with the same last dimension as the input; the residual add is
+            not performed here.
+        """
+        # === Tensor Dimension Symbols ===
+        # B: batch_size     — number of samples in the batch
+        # T: seq_len        — number of tokens per sample
+        # D: hidden_dim     — model embedding size
+        # R: ffn_rank       — latent shared subspace dimension
+        # D_ff: intermediate_size — FFN hidden dimension
+        # =========================
+        #    Gated Feedforward
+        # =========================
+        if self.is_dense:
+            # =============
+            #     Dense
+            # =============
+            # Input:  x [B, T, D]
+            # Output: x_proj [B, T, D_ff]
+            x_proj = self.W_in(x)
+            # Output: gate [B, T, D_ff]
+            gate = self.W_gate(x)
+            # SwiGLU nonlinearity
+            x = F.silu(x_proj) * gate  # [B, T, D_ff]
+            # See notes on dropout
+            #x = self.dropout(x)
+            # Output: x [B, T, D]
+            x = self.W_out(x)
+        else:
+            # ==================
+            #     Decomposed
+            # ==================
+            # Each projection goes through the rank-R bottleneck first:
+            # shared down-projection -> norm -> up-projection.
+            # Input:  x [B, T, D]
+            # Output: x_proj [B, T, D_ff]
+            x_proj = self.W_in(self.W_in_shared_norm(self.W_in_shared(x)))
+            # Input:  x [B, T, D]
+            # Output: gate [B, T, D_ff]
+            gate = self.W_gate(self.W_gate_shared_norm(self.W_gate_shared(x)))
+            # SwiGLU nonlinearity
+            x = F.silu(x_proj) * gate  # [B, T, D_ff]
+            # See notes on dropout
+            #x = self.dropout(x)
+            # D_ff -> R (W_out), then R -> D (W_out_shared); no norm in between.
+            # Output: x [B, T, D]
+            x = self.W_out_shared(self.W_out(x))
+        return x

checkpoint-2700/gla.py ADDED Viewed

	@@ -0,0 +1,721 @@

+"""# ▂▂▂▂▂▂▂▂▂▂▂▂
+# `gla.py`
+Based on: https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional
+import math
+from .shared_space_config import SharedSpaceDecoderConfig
+def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
+    """
+    Create a normalization layer based on the config norm_type.
+    If `hidden_size` is `None`, this returns an identity layer.
+    Args:
+        hidden_size: The dimension to normalize over, or None to disable normalization
+        config: Configuration containing norm_type and epsilon values
+            (`layer_norm_eps` is read for "layernorm", `rms_norm_eps` for "rmsnorm")
+    Returns:
+        nn.Identity when hidden_size is None; otherwise an nn.LayerNorm or a
+        DeepseekV3RMSNorm, depending on config.norm_type
+    Raises:
+        ValueError: If hidden_size is not None and config.norm_type is neither
+            "layernorm" nor "rmsnorm"
+    """
+    # The None check comes first, so norm_type is not inspected in that case.
+    if hidden_size is None:
+        return nn.Identity()
+    elif config.norm_type == "layernorm":
+        return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+    elif config.norm_type == "rmsnorm":
+        return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
+    else:
+        # This should be caught by config validation, but being defensive
+        raise ValueError(f"Unknown norm_type: {config.norm_type}")
+# TODO - Find a shared place to put this.
+class DeepseekV3RMSNorm(nn.Module):
+    """Root-mean-square normalization over the last dimension with a learned scale."""
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        DeepseekV3RMSNorm is equivalent to T5LayerNorm
+        Args:
+            hidden_size: Size of the last dimension; the learned `weight` has this shape.
+            eps: Added to the mean square before taking the reciprocal square root.
+        """
+        super().__init__()
+        # Per-feature scale, initialized to ones.
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        """Divide `hidden_states` by its RMS over the last dim, then scale by `weight`."""
+        # Remember the caller's dtype; the statistics below are computed in float32.
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        # Mean of squares over the last dimension (no mean subtraction).
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        # Cast back to the input dtype before applying the learned scale.
+        return self.weight * hidden_states.to(input_dtype)
+# Helper function needed because it's called twice during RoPE,
+# but I dumped it in the comments there.
+# TODO - Nah, screw it, just write it twice! At least then you get
+# to use the word 'query' instead of 'x'.
+def rotate_half(x):
+    """Rotates half the hidden dims of the input.
+    Splits the last dimension at `x.shape[-1] // 2` into (x1, x2) and returns
+    `cat(-x2, x1)` along that dimension, so element i is paired with element
+    i + half (split-half layout, not adjacent even/odd pairs).
+    """
+    # First half of the last dimension.
+    x1 = x[..., : x.shape[-1] // 2]
+    # Second half of the last dimension.
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+class RotaryEmbedding(nn.Module):
+    """Precompute RoPE embeddings and store them as buffers.
+    The `cos` and `sin` tables are built once in `__init__` for positions
+    0 .. config.max_position_embeddings - 1. `forward` is a stub that returns None.
+    """
+    def __init__(self, config: SharedSpaceDecoderConfig) -> None:
+        """
+        Build the cos/sin tables.
+        Args:
+            config: Provides rope_dims, max_position_embeddings, rope_theta and the
+                optional rope_scaling dict (keys "type" and "factor").
+        """
+        super().__init__()
+        dim = config.rope_dims
+        seq_len = config.max_position_embeddings
+        # ------------------------------
+        # Compute inverse frequencies
+        # ------------------------------
+        # Shape: [dim // 2] (for even dim)
+        #   inv_freq[j] = 1 / (theta^(2j / dim)),  j = 0 .. dim/2 - 1
+        inv_freq = 1.0 / (
+            config.rope_theta
+            ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
+        )
+        # ------------------------------
+        # Apply RoPE scaling if configured
+        # ------------------------------
+        # NOTE: all three branches below perform the same division; only the
+        # unknown-type branch additionally prints a warning.
+        if config.rope_scaling is not None:
+            scaling_type = config.rope_scaling.get("type", "linear")
+            scaling_factor = config.rope_scaling.get("factor", 1.0)
+            if scaling_type == "linear":
+                # Linear scaling: divide frequencies by scaling factor
+                inv_freq = inv_freq / scaling_factor
+            elif scaling_type == "dynamic":
+                # "dynamic" is currently treated exactly like linear scaling;
+                # no sequence-length-dependent adjustment is made here.
+                inv_freq = inv_freq / scaling_factor
+            else:
+                print(f"Warning: Unknown RoPE scaling type '{scaling_type}', using linear scaling")
+                inv_freq = inv_freq / scaling_factor
+        # ------------------------------
+        # Compute position indices
+        # ------------------------------
+        # Shape: [seq_len]
+        t = torch.arange(seq_len, dtype=torch.float32)
+        # ------------------------------
+        # Outer product: [seq_len, dim // 2]
+        # Each row i contains: t[i] * inv_freq
+        # ------------------------------
+        freqs = torch.outer(t, inv_freq)
+        # ------------------------------
+        # Duplicate the angles along the last dim: [seq_len, dim]
+        # Layout is two concatenated copies [a_0 .. a_{d/2-1}, a_0 .. a_{d/2-1}]
+        # (not interleaved), which matches the split-half pairing of rotate_half.
+        # ------------------------------
+        emb = torch.cat((freqs, freqs), dim=-1)
+        # ------------------------------
+        # Register cos/sin as buffers
+        # - Stored in float32
+        # - Will be moved to correct device/dtype via model.to(...)
+        # - Not saved with state_dict (persistent=False)
+        # ------------------------------
+        self.register_buffer("cos", emb.cos(), persistent=False)
+        self.register_buffer("sin", emb.sin(), persistent=False)
+    def forward(self, position_ids: torch.LongTensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """Stub: ignores `position_ids` and always returns None.
+        NOTE(review): the return annotation promises a (cos, sin) tuple, but no
+        tuple is produced; callers presumably read the `cos` / `sin` buffers
+        directly -- confirm against the decoder.
+        """
+        return None # This function is not necessary.
+"""## GLA"""
+class GroupedLatentAttention(nn.Module):
+    """
+    This version of Multihead Latent Attention applies the re-ordering trick from DeepSeekV3.
+    Instead of comparing the queries and keys in the query-key space, we compare them in the
+    kv-shared space.
+    For clarity, I've re-interpreted the naming of the heads, and am framing it as MQA.
+    What were previously labeled the query and key heads are now treated as a low-rank decomposition
+    of the query heads.
+    What we considered the "shared key/value space" is now a single key head that is also used as the
+    value head.
+    Finally, what we previously labeled the value and output heads are now treated as a low-rank
+    decomposition of the output heads.
+    This interpretation / implementation is designed to leverage the performance benefits of GQA.
+    The trade-off is that the query-key matching space is now larger--it will require a greater
+    number of calculations to match the queries to the keys. The hope is that the memory bandwidth
+    savings will outweigh the increased computational cost.
+    The same applies to the value-output space.
+    Note that, although the query-key and value-output spaces are now large, the low-rank
+    decomposition of the query heads and output heads ensures that the heads are still effectively
+    low rank / not over-parameterized.
+    Finally, note that this implementation also supports the optional use of shared spaces on
+    the query and output sides.
+    I've named the class "GroupedLatentAttention" because I may expand it to support multiple
+    key/value heads (i.e., multiple groups of query heads) in the future.
+    ==== Adding RoPE to VO ====
+    ### **Attempt**
+    We're extending Rotary Position Embeddings (RoPE) beyond the query-key interaction to the **value-output path** in Multihead Latent Attention (MLA).
+    * In DeepSeek-V3's MLA framing, the same **full-rank key/value head** provides both the keys (for patterns) and the values (for messages).
+    * Queries and output heads are low-rank bottlenecks, effectively serving as vocabularies of **pattern directions** (Q) and **message directions** (O).
+    * Standard RoPE only modulates the Q–K dot product. Our attempt is to also apply RoPE phases consistently in the V–O pathway, so that **positional dependence is preserved in both the matching (QK) and messaging (VO) sides**.
+    --
+    ### **Hypothesis**
+    If we rotate value vectors by their **source position phase** and then apply the **inverse rotation at the destination** before output projection, the model gains a clean **relative-position equivariance** in the message path, mirroring the property RoPE provides for queries and keys.
+    This should:
+    1. Make the 1-to-1 correspondence between "pattern templates" (Q) and "message templates" (O) more consistent.
+    2. Reduce the burden on output heads to learn ad-hoc positional compensation.
+    3. Improve long-context generalization, since both attention matching *and* message passing would share the same relative-position geometry.
+    """
+    def __init__(self, config: SharedSpaceDecoderConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        # Used to determine if this layer is dense or uses latents.
+        self.layer_idx = layer_idx
+        self.attention_dropout_prob = config.attention_dropout_prob
+        self.num_heads = config.num_attention_heads
+        self.rope_theta = config.rope_theta
+        self.rope_dims = config.rope_dims
+        self.nope_dims = config.nope_dims
+        self.q_shared_dim = config.q_shared_dim
+        # What was previously considered the key/value shared dimension is now the
+        # size of the MQA style single key/value head.
+        self.kv_head_dim = config.kv_shared_dim
+        self.o_shared_dim = config.o_shared_dim
+        # What was previously the query/key head size is now the size of
+        # the query head decomposition.
+        self.q_inner_dim = config.qk_private_dim
+        # What was previously the value/output head size is now the size of
+        # the output head decomposition.
+        self.o_inner_dim = config.vo_private_dim
+        self.hidden_size = config.hidden_size
+        # =========================
+        #     Input Projections
+        # =========================
+        # If this is one of the dense layers,
+        if self.layer_idx < config.num_dense_layers:
+            # =========================
+            #     Dense Attention
+            # =========================
+            # No latent projections.
+            self.latent_spaces = False
+            # Define the standard QKV projection
+            self.qkv_proj = nn.Linear(
+                config.hidden_size,
+                self.num_heads * (self.qk_private_dim * 2 + self.vo_private_dim),
+                bias=config.attention_bias,
+            )
+            # Dense output projection
+            self.o_proj = nn.Linear(
+                self.num_heads * self.vo_private_dim,
+                config.hidden_size,
+                bias=config.attention_bias,
+            )
+        # If we're past the dense layers,
+        else:
+            # =========================
+            #     Latent Attention
+            # =========================
+            # Use latent projections.
+            self.latent_spaces = True
+            # Input latent projections
+            print("config.q_shared_dim", config.q_shared_dim)
+            # ==========================
+            #     Shared Query Space
+            # ==========================
+            # If we're using a shared query subspace,
+            if config.q_shared_dim is not None:
+                # Set a flag that we'll check in `forward`.
+                self.query_shared = True
+                self.q_shared_proj = nn.Linear(
+                    config.hidden_size,
+                    self.q_shared_dim,
+                    bias=config.attention_bias,
+                )
+                self.q_shared_norm = create_norm_layer(self.q_shared_dim, config)
+            else:
+                print("Using identity for shared projection.")
+                # Set a flag that we'll check in `forward`.
+                self.query_shared = False
+                self.q_shared_dim = config.hidden_size
+                #print("Updated self.q_shared_dim to", self.q_shared_dim)
+                # Use identity.
+                self.q_shared_proj = nn.Identity()
+                self.q_shared_norm = nn.Identity()
+            # ==========================
+            #     Shared Output Space
+            # ==========================
+            # If we're using a shared output space,
+            if config.o_shared_dim is not None:
+                # Set a flag that we'll check in `forward`.
+                self.output_shared = True
+                # Shared output projection
+                # The head outputs from `o_private_proj` are first summed together (across
+                # heads) in the latent space.
+                # Then we project their combined outputs (a single vector per token)
+                # back to model space via `o_shared_proj`.
+                self.o_shared_proj = nn.Linear(
+                    self.o_shared_dim,
+                    self.hidden_size,
+                    bias=config.attention_bias
+                )
+                self.o_shared_norm = create_norm_layer(self.o_shared_dim, config)
+            else:
+                # Set a flag that we'll check in `forward`.
+                self.output_shared = False
+                self.o_shared_dim = config.hidden_size
+                # Use identity.
+                self.o_shared_proj = nn.Identity()
+                self.o_shared_norm = nn.Identity()
+            # ================================
+            #      Decomposed Query Heads
+            # ================================
+            # Query down projections.
+            # The query head inner dimension makes the head low rank, as usual.
+            self.q_priv_a_proj = nn.Linear(
+                self.q_shared_dim,
+                self.num_heads * self.q_inner_dim,
+                bias=False
+            )
+            # Query up projections.
+            # We project back to the larger key/value space.
+            # Rather than create a linear and break it apart, we can create our
+            # desired shapes.
+            #  per-head Dq_c -> Dkv     (store as [H, Dq_c, Dkv])
+            self.q_priv_b_weight = nn.Parameter(
+                torch.empty(self.num_heads, self.q_inner_dim, self.kv_head_dim)
+            )
+            nn.init.kaiming_uniform_(self.q_priv_b_weight, a=math.sqrt(5))
+            # ====================================
+            #      Single Joint Key/Value Head
+            # ====================================
+            # The single joint key/value head.
+            self.kv_priv_proj = nn.Linear(
+                self.hidden_size,
+                self.kv_head_dim,
+                bias=False,
+            )
+            self.kv_priv_norm = create_norm_layer(self.kv_head_dim, config)
+            # ================================
+            #      Decomposed Output Heads
+            # ================================
+            # Down: values [B,H,T,Dkv] -> per-head Do_c using weights [H, Dkv, Do_c]
+            self.o_priv_a_weight = nn.Parameter(
+                torch.empty(self.num_heads, self.kv_head_dim, self.o_inner_dim)
+            )
+            nn.init.kaiming_uniform_(self.o_priv_a_weight, a=math.sqrt(5))
+            # Output up projections.
+            # We project back to the larger output subspace (or the model space,
+            # if no subspace is used).
+            self.o_priv_b_proj = nn.Linear(
+                self.num_heads * self.o_inner_dim,
+                self.o_shared_dim,
+                bias=False
+            )
+        # Let SDPA choose 1/sqrt(E). If you want explicit: self.kv_head_dim ** -0.5
+        self.softmax_scale = None
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        #past_key_value: Optional[Cache] = None, # TODO - Can I remove this?
+        #cache_position: Optional[torch.LongTensor] = None, # TODO - Can I remove this?
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        # === Tensor Dimension Symbols ===
+        #     B: batch_size     — number of samples in the batch
+        #     T: seq_len        — number of tokens per sample
+        #     H: n_heads        — number of attention heads
+        #     D: hidden_dim     — model embedding size
+        #  Dq_c: q_inner_dim    - per-head decomposition dim for Q
+        Dq_c = self.q_inner_dim        # per-head inner dim for Q
+        #  Do_c: o_inner_dim    - per-head decomposition dim for O
+        Do_c = self.o_inner_dim        # per-head inner dim for O
+        #   Dkv: kv_head_dim    - Head size of the joint key/value head
+        Dkv  = self.kv_head_dim        # Head size of the joint key/value head
+        #    Dr: rope_dims      - The first Dr dimensions receive rope.
+        #  Dq_s: q_shared_dim   - query shared subspace size
+        Dq_s = self.q_shared_dim
+        #  Do_s: o_shared_dim   - output shared subspace size
+        Do_s = self.o_shared_dim
+        # Input token embeddings
+        # hidden_states: [B, T, D]
+        B, T = hidden_states.shape[:2]
+        H = self.num_heads
+        # =============================
+        #     Shared Query Space
+        # =============================
+        # These are set to identity if no shared query space is used.
+        # Project token embeddings into shared latents
+        # Input:
+        #     hidden_states [B, T, D]
+        #     q_shared_proj [D, Dq_s]
+        #    kv_shared_proj [D, Dkv]
+        # Output:
+        #          q_shared  [B, T, Dq_s]
+        #          kv_shared [B, T, Dkv]
+        q_shared = self.q_shared_proj(hidden_states)
+        # Normalize latent vectors, shapes unchanged.
+        q_shared = self.q_shared_norm(q_shared)
+        # ================================
+        #     Decomposed Query Heads
+        # ================================
+        # Project query latents onto decomposed query heads.
+        #
+        # Down projection ('a')
+        # Input:
+        #     q_shared       [B, T, Dq_s]
+        #     q_priv_a_proj [Dq_s, H*Dq_c]
+        # Output:
+        #     queries_c   [B, T, H*Dq_c]
+        queries_c = self.q_priv_a_proj(q_shared)
+        # Split the vectors by head
+        # Input:
+        #     queries_c        [B, T, H*Dq_c]
+        # Output:
+        #     queries_c   [B, T, H, Dq_c]
+        queries_c = queries_c.view(B, T, H, Dq_c)
+        # Up projection ('b')
+        # Input:
+        #     queries_c        [B, T, H, Dq_c]
+        #     q_priv_b_weight        [H, Dq_c, Dkv]
+        # Output:
+        #     queries     [B, H, T, Dkv]
+        queries = torch.einsum("bthd,hdc->bhtc", queries_c, self.q_priv_b_weight)
+        # ===================================
+        #     Single Joint Key/Value Head
+        # ===================================
+        # Project token embeddings into single joint key/value head.
+        # Input:
+        #     hidden_states [B, T, D]
+        #     kv_priv_proj [D, Dkv]
+        # Output:
+        #     keyvalue [B, T, Dkv]
+        keyvalue = self.kv_priv_proj(hidden_states)
+        # Apply QK normalization.
+        keyvalue = self.kv_priv_norm(keyvalue)
+        # Prepare the queries and keyvalue vectors for RoPE and flash attention.
+        # We have multiple query heads, and the queries are in `queries`.
+        # We have a single key head, and the keyvector is in `keyvalue`.
+        # Move the head dimension to the front, so for each head, we have
+        # a series of vectors for each token in the sequence.
+        #
+        # Inputs:
+        #   keyvalue  [B, T, Dkv]
+        # Output:
+        #   keyvalue   [B, 1, T, Dkv]
+        keyvalue = keyvalue.unsqueeze(1)
+        # ==================
+        #        RoPE
+        # ==================
+        # Apply rotary position embeddings to the first `self.rope_dims` of
+        # each head.
+        # The slice operations are free, but the concatenation is
+        # not, because the outputs of the rotation operation are new data
+        # occupying different memory. Still considered the best option,
+        # though.
+        # 1. Unpack the precomputed cosine and sine embeddings
+        # Position embeddings is a tuple of
+        #    (cos [seq_len, rope_dims],
+        #     sin [seq_len, rope_dims])
+        cos, sin = position_embeddings
+        # 2. Split the query and key heads into the part to rotate and the part
+        #    to pass through (early columns get position info, later ones don't)
+        #
+        #  (Using queries as example)
+        #  Inputs:
+        #    queries  [B, H, T, Dkv]  Dkv = rope_dims + not_rope_dims
+        #  Outputs:
+        #    q_rope   [B, H, T, Dr]
+        #    q_pass   [B, H, T, Dkv-Dr]
+        q_rope, q_pass = queries[..., :self.rope_dims], queries[..., self.rope_dims:]
+        k_rope, k_pass =   keyvalue[..., :self.rope_dims],   keyvalue[..., self.rope_dims:]
+        # 3. Apply the rotary embedding to the designated slice
+        #
+        # To broadcast cos and sin across the batch and head dimensions, we unsqueeze them.
+        # Shape change: [T, Dr] -> [1, 1, T, Dr]
+        cos = cos.unsqueeze(0).unsqueeze(0)
+        sin = sin.unsqueeze(0).unsqueeze(0)
+        #print("q_rope.shape[-1] // 2:", (q_rope.shape[-1] // 2))
+        #print("x1 = x[..., :x.shape[-1] // 2 ].shape:", q_rope[..., :q_rope.shape[-1] // 2 ].shape)
+        #print("sin/cos.shape:", cos.shape)
+        #print("q_rope.shape:", q_rope.shape)
+        #print("(q_rope * cos).shape:", (q_rope * cos).shape)
+        #print("rotate_half(q_rope).shape:", rotate_half(q_rope).shape)
+        #print("(rotate_half(q_rope) * sin).shape:", (rotate_half(q_rope) * sin).shape)
+        """
+        In this example   batch_size = 2, hum_heads = 8, seq_len = 65, rope_dims = 16
+                        q_rope.shape[-1] // 2: 8
+        x1 = x[..., :x.shape[-1] // 2 ].shape: torch.Size([2, 8, 65, 8])
+                    sin/cos.shape: torch.Size([1, 1, 65, 16])  # After double unsqueeze.
+                    vq_rope.shape: torch.Size([2, 8, 65, 16])
+             (q_rope * cos).shape: torch.Size([2, 8, 65, 16])
+        rotate_half(q_rope).shape: torch.Size([2, 8, 65, 16])
+        (rotate_half(q_rope) * sin).shape: torch.Size([2, 8, 65, 16])
+        """
+        # Let's walk through the queries as the example.
+        # What does rotate half do?
+        #    dim -1 is the row vectors, the queries
+        #
+        #  Step 1: Split the vector in half.
+        #    "q_rope.shape[-1] // 2" <- How much to select. Half the length of the q_rope vector
+        #    x1 = x[..., :x.shape[-1] // 2 ]  # Select the first half of the vector.
+        #    x2 = x[...,  x.shape[-1] // 2:]  # Select the second half.
+        #
+        #  Step 2:
+        #      - Apply negative to the values in the second half.
+        #      - Reverse the order of the halves.
+        #    return torch.cat((-x2, x1), dim=-1)
+        #
+        # ---- (q_rope * cos) ----
+        # Element-wise multiply the values in each `cos` vector with the
+        # corresponding (i.e., same sequence position) `q_rope` vector.
+        #
+        # Inputs:
+        #    q_rope  [B, H, T, Dr]
+        #       cos  [1, 1, T, Dr]
+        #
+        # Outputs:
+        #        x   [B, H, T, Dr]
+        #
+        # ---- (rotate_half(q_rope)) ----
+        #  TODO
+        #
+        # Inputs:
+        #       q_rope    [B, T, Dr]
+        #
+        # Outputs:
+        #   rot_q_rope    [B, T, Dr]
+        #
+        # ---- rotated * sin ----
+        #  TODO
+        q_rotated = (q_rope * cos) + (rotate_half(q_rope) * sin)
+        k_rotated = (k_rope * cos) + (rotate_half(k_rope) * sin)
+        # 4. Concatenate the rotated and pass-through parts back together
+        # Input (each): [B, H, T, Dr] and [B, H, T, Dkv-Dr]
+        # Output (each): [B, H, T, Dkv]
+        # (Where h = 1 for the key head and h = num_heads for the query heads)
+        queries = torch.cat((q_rotated, q_pass), dim=-1)
+        keyvalue = torch.cat((k_rotated, k_pass), dim=-1)
+        # ====================
+        #      GQA / MQA
+        # ====================
+        # GPT says that flash attention will infer the broadcasting, so `expand` is not needed.
+        #
+        # We need to use the `expand` operation to broadcast the keyvalue vector
+        # across the query heads.
+        # Input:
+        #     keyvalue [B, 1, T, Dkv]
+        # Output:
+        #     keyvalue [B, H, T, Dkv]
+        #keyvalue = keyvalue.expand(-1, H, -1, -1)
+        # ===================
+        #       Attention
+        # ===================
+        # We're ready for the attention score calculation.
+        # Only apply dropout during training.
+        # self.training is a pytorch flag.
+        if self.training:
+            dropout_p = self.attention_dropout_prob
+        else:
+            dropout_p = 0.0
+        # Call SDPA / Flash Attention
+        # https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+        # Apply MQA / GQA. In this case, we have a single key head, and multiple query heads.
+        values = F.scaled_dot_product_attention(
+            queries,
+            keyvalue, # Single key vector (joint with value) for GQA / MQA.
+            keyvalue, # Single value vector (joint with key) for GQA / MQA.
+            attn_mask=None, # attention_mask,
+            dropout_p=dropout_p,
+            scale=self.softmax_scale,
+            is_causal=True, # This is a decoder - apply causal masking
+        )
+        # Attention outputs:
+        # values [B, H, T, Dkv]
+        # The final Dr dims of the value vectors carry RoPE information.
+        # We can either (1) add position dependence to the value-output process,
+        # or (2) we can strip off the RoPE information and only use the non-RoPE parts.
+        # Let's try option 1!
+        # Split the values into the RoPE and non-RoPE parts.
+        # Input:
+        #     values [B, H, T, Dkv]
+        # Output:
+        #     values_rope [B, H, T, Dr]
+        #     values_pass [B, H, T, Dkv-Dr]
+        values_rope, values_pass = values[..., :self.rope_dims], values[..., self.rope_dims:]
+        # Fold the query RoPE information into the value vectors.
+        # Inverse rotation: R_{-θ} x  =  (x * cos)  - (rotate_half(x) * sin)
+        # Input:
+        #     values_rope [B, H, T, Dr]
+        #            cos  [1, 1, T, Dr]
+        #            sin  [1, 1, T, Dr]
+        # Output:
+        #     values_unrot [B, H, T, Dr]
+        values_unrot = (values_rope * cos) - (rotate_half(values_rope) * sin)
+        # Now the values have the offset information in their rope dimensions,
+        # and the output heads can learn to use it.
+        values = torch.cat((values_unrot, values_pass), dim=-1)  # [B,H,T,Dkv]
+        # =========================
+        #     Output Projection
+        # =========================
+        # Project the values onto the decomposed output heads.
+        # Output down projection heads.
+        # Input:
+        #            values  [B, H, T, Dkv]
+        #   o_priv_a_weight     [H, Dkv, Do_c]
+        # Output:
+        #         outputs_c  [B, H, T, Do_c]
+        outputs_c = torch.einsum("bhtd,hdc->bhtc", values, self.o_priv_a_weight)
+        # For the up projection, we can concatenate the 'outputs_c' vectors by head,
+        # (in the same way we would usually concatenate the value vectors)
+        # Input:
+        #    outputs_c  [B, H, T, Do_c]
+        # Output:
+        #   outputs_c  [B, T, H*Do_c]
+        outputs_c = outputs_c.permute(0, 2, 1, 3).contiguous().view(B, T, H * Do_c)
+        # Project up to the shared output space and sum across the output heads.
+        # Input:
+        #    outputs_c  [B, T, H*Do_c]
+        #    o_priv_b_proj [H*Do_c, Do_s]
+        # Output:
+        #    output_s  [B, T, Do_s]
+        output_s = self.o_priv_b_proj(outputs_c)
+        # Apply normalization to the output latents
+        output_s = self.o_shared_norm(output_s)
+        # Re-project the output latent representation back to model space.
+        # Input:
+        #    output_s      [B, T, Do_s]
+        #    o_shared_proj [Do_s, D]
+        # Output:
+        #    attn_output   [B, T, D]
+        attn_output = self.o_shared_proj(output_s)
+        # TODO - Not currently supported.
+        # If this is a dense layer,
+        # Project the values back into model space.
+        # attn_output = self.o_proj(attn_output)
+        # -----------------------------------------
+        return attn_output

checkpoint-2700/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2700/mla.py ADDED Viewed

	@@ -0,0 +1,619 @@

+"""# ▂▂▂▂▂▂▂▂▂▂▂▂
+# `mla.py`
+Based on: https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py
+## RotaryEmbedding
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional
+from .shared_space_config import SharedSpaceDecoderConfig
+def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
+    """
+    Create a normalization layer based on the config norm_type.
+    If `hidden_size` is `None`, this returns an identity layer.
+    Args:
+        hidden_size: The dimension to normalize over
+        config: Configuration containing norm_type and epsilon values
+    Returns:
+        Either a LayerNorm or RMSNorm layer
+    """
+    if hidden_size is None:
+        return nn.Identity()
+    elif config.norm_type == "layernorm":
+        return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+    elif config.norm_type == "rmsnorm":
+        return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
+    else:
+        # This should be caught by config validation, but being defensive
+        raise ValueError(f"Unknown norm_type: {config.norm_type}")
+# TODO - Find a shared place to put this.
+class DeepseekV3RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        DeepseekV3RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+# Helper function needed because it's called twice during RoPE,
+# but I dumped it in the comments there.
+# TODO - Nah, screw it, just write it twice! At least then you get
+# to use the word 'query' instead of 'x'.
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+class RotaryEmbedding(nn.Module):
+    """Precompute RoPE embeddings and store them as buffers."""
+    def __init__(self, config: SharedSpaceDecoderConfig) -> None:
+        super().__init__()
+        dim = config.rope_dims
+        seq_len = config.max_position_embeddings
+        # ------------------------------
+        # Compute inverse frequencies
+        # ------------------------------
+        # Shape: [dim // 2]
+        #   inv_freq[i] = 1 / (theta^(i / dim))
+        inv_freq = 1.0 / (
+            config.rope_theta
+            ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
+        )
+        # ------------------------------
+        # Apply RoPE scaling if configured
+        # ------------------------------
+        if config.rope_scaling is not None:
+            scaling_type = config.rope_scaling.get("type", "linear")
+            scaling_factor = config.rope_scaling.get("factor", 1.0)
+            if scaling_type == "linear":
+                # Linear scaling: divide frequencies by scaling factor
+                inv_freq = inv_freq / scaling_factor
+            elif scaling_type == "dynamic":
+                # Dynamic scaling: adjust based on sequence length
+                # This is a simplified implementation
+                inv_freq = inv_freq / scaling_factor
+            else:
+                print(f"Warning: Unknown RoPE scaling type '{scaling_type}', using linear scaling")
+                inv_freq = inv_freq / scaling_factor
+        # ------------------------------
+        # Compute position indices
+        # ------------------------------
+        # Shape: [seq_len]
+        t = torch.arange(seq_len, dtype=torch.float32)
+        # ------------------------------
+        # Outer product: [seq_len, dim // 2]
+        # Each row i contains: t[i] * inv_freq
+        # ------------------------------
+        freqs = torch.outer(t, inv_freq)
+        # ------------------------------
+        # Duplicate for interleaved sin/cos: [seq_len, dim]
+        # This matches the common format: [sin_0, cos_0, sin_1, cos_1, ...]
+        # ------------------------------
+        emb = torch.cat((freqs, freqs), dim=-1)
+        # ------------------------------
+        # Register cos/sin as buffers
+        # - Stored in float32
+        # - Will be moved to correct device/dtype via model.to(...)
+        # - Not saved with state_dict (persistent=False)
+        # ------------------------------
+        self.register_buffer("cos", emb.cos(), persistent=False)
+        self.register_buffer("sin", emb.sin(), persistent=False)
+    def forward(self, position_ids: torch.LongTensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """ """
+        return None # This function is not necessary.
+"""## MLA"""
+class MultiheadLatentAttention(nn.Module):
+    """
+    A variant of MLA with:
+    - Simplified RoPE handling:
+      - A portion of the head dimensions are used for position information.
+      - Same number of queries as keys. (no MQA)
+    - Optional output subspace
+    """
+    def __init__(self, config: SharedSpaceDecoderConfig, layer_idx: int):
+        """
+        Build the projections for one attention layer.
+        Layers with `layer_idx < config.num_dense_layers` are "dense": a single
+        fused QKV projection plus a dense output projection. All other layers
+        are "latent": hidden states go through optional shared Q / KV
+        projections (with a norm) before the per-head projections, and the
+        output may pass through a shared output subspace.
+        Args:
+            config: Model configuration; supplies all dimensions and flags read below.
+            layer_idx: Index of this layer, compared against `config.num_dense_layers`.
+        """
+        super().__init__()
+        self.config = config
+        # Used to determine if this layer is dense or uses latents.
+        self.layer_idx = layer_idx
+        self.attention_dropout_prob = config.attention_dropout_prob
+        self.num_heads = config.num_attention_heads
+        self.rope_theta = config.rope_theta
+        self.rope_dims = config.rope_dims
+        self.nope_dims = config.nope_dims
+        # NOTE: q_shared_dim / kv_shared_dim are overwritten with hidden_size
+        # below when the corresponding config value is None (latent layers only).
+        self.q_shared_dim = config.q_shared_dim
+        self.kv_shared_dim = config.kv_shared_dim
+        self.o_shared_dim = config.o_shared_dim
+        self.qk_private_dim = config.qk_private_dim
+        self.vo_private_dim = config.vo_private_dim
+        self.hidden_size = config.hidden_size
+        # =========================
+        #     Input Projections
+        # =========================
+        # If this is one of the dense layers,
+        if self.layer_idx < config.num_dense_layers:
+            # =========================
+            #     Dense Attention
+            # =========================
+            # No latent projections.
+            self.latent_spaces = False
+            # Fused QKV projection. Output width is H * (2 * Dq + Dv):
+            # queries and keys use qk_private_dim per head, values use vo_private_dim.
+            self.qkv_proj = nn.Linear(
+                config.hidden_size,
+                self.num_heads * (self.qk_private_dim * 2 + self.vo_private_dim),
+                bias=config.attention_bias,
+            )
+            # Dense output projection: [H * Dv] -> [hidden_size]
+            self.o_proj = nn.Linear(
+                self.num_heads * self.vo_private_dim,
+                config.hidden_size,
+                bias=config.attention_bias,
+            )
+        # If we're past the dense layers,
+        else:
+            # =========================
+            #     Latent Attention
+            # =========================
+            # Use latent projections.
+            self.latent_spaces = True
+            # NOTE(review): debug output; emitted each time a latent layer is constructed.
+            print("config.q_shared_dim", config.q_shared_dim)
+            # If we're using a shared query subspace,
+            if config.q_shared_dim is not None:
+                # Flag recording that a real shared query projection exists.
+                self.query_shared = True
+                self.q_shared_proj = nn.Linear(
+                    config.hidden_size,
+                    self.q_shared_dim,
+                    bias=config.attention_bias,
+                )
+                self.q_shared_norm = create_norm_layer(self.q_shared_dim, config)
+            else:
+                print("Using identity for shared projection.")
+                # Flag recording that the shared query projection is a pass-through.
+                self.query_shared = False
+                # The query heads read the hidden states directly, so their
+                # input width is the model width.
+                self.q_shared_dim = config.hidden_size
+                # Use identity.
+                self.q_shared_proj = nn.Identity()
+                self.q_shared_norm = nn.Identity()
+            # If we're using a shared key/value subspace,
+            if config.kv_shared_dim is not None:
+                # Flag recording that a real shared key/value projection exists.
+                self.keyvalue_shared = True
+                self.kv_shared_proj = nn.Linear(
+                    config.hidden_size,
+                    self.kv_shared_dim,
+                    bias=config.attention_bias,
+                )
+                self.kv_shared_norm = create_norm_layer(self.kv_shared_dim, config)
+            else:
+                # Flag recording that the shared key/value projection is a pass-through.
+                self.keyvalue_shared = False
+                # The key/value heads read the hidden states directly.
+                self.kv_shared_dim = config.hidden_size
+                # Use identity.
+                self.kv_shared_proj = nn.Identity()
+                self.kv_shared_norm = nn.Identity()
+            # Query heads: [q_shared_dim] -> [H * Dq]
+            self.q_private_proj = nn.Linear(
+                self.q_shared_dim,
+                self.num_heads * self.qk_private_dim,
+                bias=False # TODO
+            )
+            # Key and value heads, concatenated: [kv_shared_dim] -> [H * (Dq + Dv)]
+            self.kv_private_proj = nn.Linear(
+                self.kv_shared_dim,
+                self.num_heads * (self.qk_private_dim + self.vo_private_dim),
+                bias=False,
+            )
+            # Use output subspace if o_shared_dim is specified
+            self.output_subspace = config.o_shared_dim is not None
+            # If we're using an output subspace,
+            if self.output_subspace:
+                # ==========================
+                #     Output Subspace
+                # ==========================
+                self.o_shared_dim = config.o_shared_dim
+                # Per-head output projections
+                # (Similar to original W^O, but projects the scored value vectors
+                #  into a latent space instead of back to the model)
+                self.o_private_proj = nn.Linear(
+                    self.num_heads * self.vo_private_dim,
+                    self.o_shared_dim,
+                    bias=False
+                )
+                # Norm layer between o_private_proj and o_shared_proj
+                # Note: In previous ViT experiments, this norm step hurt performance, but was beneficial
+                #       in the DeepSeekV3 experiments.
+                # The norm type is taken from the config so it can be tested in different contexts.
+                self.o_private_norm = create_norm_layer(self.o_shared_dim, config)
+                # Shared output projection
+                # The head outputs from `o_private_proj` are first summed together (across
+                # heads) in the latent space.
+                # Then we project their combined outputs (a single vector per token)
+                # back to model space via `o_shared_proj`.
+                self.o_shared_proj = nn.Linear(
+                    self.o_shared_dim,
+                    self.hidden_size,
+                    bias=config.attention_bias
+                )
+            else:
+                # Dense output projection: [H * Dv] -> [hidden_size]
+                self.o_proj = nn.Linear(
+                    self.num_heads * self.vo_private_dim,
+                    config.hidden_size,
+                    bias=config.attention_bias,
+                )
+        # Softmax scaling factor: 1 / sqrt(Dq), passed to SDPA as `scale`.
+        self.softmax_scale = self.qk_private_dim ** (-0.5)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        #past_key_value: Optional[Cache] = None, # TODO - Can I remove this?
+        #cache_position: Optional[torch.LongTensor] = None, # TODO - Can I remove this?
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        # === Tensor Dimension Symbols ===
+        #    B: batch_size     — number of samples in the batch
+        #    T: seq_len        — number of tokens per sample
+        #    H: n_heads        — number of attention heads
+        #    D: hidden_dim     — model embedding size
+        #   Dv: vo_private_dim - per-head value/output projection dimension
+        #   Dr: rope_dims      - The first Dr dimensions receive rope.
+        #   Cq: q_shared_dim   - query shared subspace size
+        #  Ckv: kv_shared_dim  - key-value shared subspace size
+        #   Co: o_shared_dim   - output shared subspace size
+        # Input token embeddings
+        # hidden_states: [B, T, D]
+        B, T = hidden_states.shape[:2]
+        H = self.num_heads
+        Dq = self.qk_private_dim     # per-head dim for Q and K
+        Dv = self.vo_private_dim     # per-head dim for V/O
+        Dc_q, Dc_kv = self.q_shared_dim, self.kv_shared_dim
+        # ==============================
+        #      QKV Head Projections
+        # ==============================
+        # Project tokens into per-head query, key, and value vectors
+        # If this layer uses latent projections,
+        if self.latent_spaces:
+            # ================================
+            #     Shared Space Projections
+            # ================================
+            # Project token embeddings into shared latents
+            # Input:
+            #     hidden_states [B, T, D]
+            #     q_shared_proj [D, Cq]
+            #    kv_shared_proj [D, Ckv]
+            # Output:
+            #          q_shared  [B, T, Cq]
+            #          kv_shared [B, T, Ckv]
+            # If we're using a shared query subspace,
+            if self.q_shared_dim is not None:
+                q_shared = self.q_shared_proj(hidden_states)
+                # Normalize latent vectors, shapes unchanged.
+                q_shared = self.q_shared_norm(q_shared)
+            # Otherwise,
+            else:
+                # Use the hidden states
+                q_shared = hidden_states
+            # If we're using a shared key/value subspace,
+            if self.kv_shared_dim is not None:
+                # Project token embeddings into shared subspace.
+                kv_shared = self.kv_shared_proj(hidden_states)
+                # Normalize latent vectors, shapes unchanged.
+                kv_shared = self.kv_shared_norm(kv_shared)
+            # Otherwise,
+            else:
+                # Use the hidden states
+                kv_shared = hidden_states
+            # ======================================
+            #     Per-Head (Private) Projections
+            # ======================================
+            # Project query latents onto query heads.
+            # Input:
+            #     q_shared       [B, T, Cq]
+            #     q_private_proj [Cq, H*Dh]
+            # Output:
+            #     queries   [B, T, H*Dh]
+            queries = self.q_private_proj(q_shared)
+            # Project key/value latents onto key and value heads.
+            # The key and value heads are all concatenated, each head occupies
+            # Dh columns of the kv_private_proj. This yields the key and value
+            # vectors concatenated in the same way.
+            #
+            # Input:
+            #          kv_shared [B, T, Ckv]
+            #    kv_private_proj [Ckv, 2*H*Dh]
+            # Output:
+            #     keysvalues [B, T, 2*H*Dh]
+            keysvalues = self.kv_private_proj(kv_shared)
+            # Split into key and value tensors
+            # Each: [B, T, H * Dh]
+            keys, values = keysvalues.chunk(2, dim=-1)
+        # If this is a dense attention layer (no latent projections),
+        else:
+            # ====================
+            #     Standard MHA
+            # ====================
+            # Standard QKV projection
+            # Input:
+            #   hidden_states     [B, T, D]
+            #         qkv_proj    [D, 3*H*Dh]
+            # Output:
+            #   querieskeysvalues [B, T, 3*H*Dh]
+            querieskeysvalues = self.qkv_proj(hidden_states)
+            # Separate query, key, and value vectors
+            # Each: [B, T, H * Dh]
+            queries, keys, values = querieskeysvalues.chunk(3, dim=-1)
+        # Split up queries so that there's just one per row.
+        # Same for keys and values.
+        #
+        # Inputs:
+        #   Each  [B, T, H*Dh]
+        # Output:
+        #   Each  [B, H,  T,  Dh]
+        queries = queries.view(B, T, H, Dq).transpose(1, 2)
+        keys =       keys.view(B, T, H, Dq).transpose(1, 2)
+        values =   values.view(B, T, H, Dv).transpose(1, 2)
+        # ==================
+        #        RoPE
+        # ==================
+        # Apply rotary position embeddings to the first `self.rope_dims` of
+        # each head.
+        # The slice operations are free, but the concatenation is
+        # not, because the outputs of the rotation operation are new data
+        # occupying different memory. Still considered the best option,
+        # though.
+        # 1. Unpack the precomputed cosine and sine embeddings
+        # Position embeddings is a tuple of
+        #    (cos [seq_len, rope_dims],
+        #     sin [seq_len, rope_dims])
+        cos, sin = position_embeddings
+        # 2. Split the query and key heads into the part to rotate and the part
+        #    to pass through (early columns get position info, later ones don't)
+        #
+        #  (Using queries as example)
+        #  Inputs:
+        #    queries  [B, H, T, Dh]  Dh = rope_dims + not_rope_dims
+        #  Outputs:
+        #    q_rope   [B, H, T,  Dr]
+        #    q_pass   [B, H, T, Dh-Dr]
+        q_rope, q_pass = queries[..., :self.rope_dims], queries[..., self.rope_dims:]
+        k_rope, k_pass =    keys[..., :self.rope_dims],    keys[..., self.rope_dims:]
+        # 3. Apply the rotary embedding to the designated slice
+        #
+        # To broadcast cos and sin across the batch and head dimensions, we unsqueeze them.
+        # Shape change: [T, Dr] -> [1, 1, T, Dr]
+        cos = cos.unsqueeze(0).unsqueeze(0)
+        sin = sin.unsqueeze(0).unsqueeze(0)
+        #print("q_rope.shape[-1] // 2:", (q_rope.shape[-1] // 2))
+        #print("x1 = x[..., :x.shape[-1] // 2 ].shape:", q_rope[..., :q_rope.shape[-1] // 2 ].shape)
+        #print("sin/cos.shape:", cos.shape)
+        #print("q_rope.shape:", q_rope.shape)
+        #print("(q_rope * cos).shape:", (q_rope * cos).shape)
+        #print("rotate_half(q_rope).shape:", rotate_half(q_rope).shape)
+        #print("(rotate_half(q_rope) * sin).shape:", (rotate_half(q_rope) * sin).shape)
+        """
+        In this example   batch_size = 2, hum_heads = 8, seq_len = 65, rope_dims = 16
+                        q_rope.shape[-1] // 2: 8
+        x1 = x[..., :x.shape[-1] // 2 ].shape: torch.Size([2, 8, 65, 8])
+                    sin/cos.shape: torch.Size([1, 1, 65, 16])  # After double unsqueeze.
+                    vq_rope.shape: torch.Size([2, 8, 65, 16])
+             (q_rope * cos).shape: torch.Size([2, 8, 65, 16])
+        rotate_half(q_rope).shape: torch.Size([2, 8, 65, 16])
+        (rotate_half(q_rope) * sin).shape: torch.Size([2, 8, 65, 16])
+        """
+        # Let's walk through the queries as the example.
+        # What does rotate half do?
+        #    dim -1 is the row vectors, the queries
+        #
+        #  Step 1: Split the vector in half.
+        #    "q_rope.shape[-1] // 2" <- How much to select. Half the length of the q_rope vector
+        #    x1 = x[..., :x.shape[-1] // 2 ]  # Select the first half of the vector.
+        #    x2 = x[...,  x.shape[-1] // 2:]  # Select the second half.
+        #
+        #  Step 2:
+        #      - Apply negative to the values in the second half.
+        #      - Reverse the order of the halves.
+        #    return torch.cat((-x2, x1), dim=-1)
+        #
+        # ---- (q_rope * cos) ----
+        # Element-wise multiply the values in each `cos` vector with the
+        # corresponding (i.e., same sequence position) `q_rope` vector.
+        #
+        # Inputs:
+        #    q_rope  [B, H, T, Dr]
+        #       cos  [1, 1, T, Dr]
+        #
+        # Outputs:
+        #        x   [B, H, T, Dr]
+        #
+        # ---- (rotate_half(q_rope)) ----
+        #  TODO
+        #
+        # Inputs:
+        #       q_rope    [B, T, Dr]
+        #
+        # Outputs:
+        #   rot_q_rope    [B, T, Dr]
+        #
+        # ---- rotated * sin ----
+        #  TODO
+        q_rotated = (q_rope * cos) + (rotate_half(q_rope) * sin)
+        k_rotated = (k_rope * cos) + (rotate_half(k_rope) * sin)
+        # 4. Concatenate the rotated and pass-through parts back together
+        # Input (each): [B, H, T, Dr] and [B, H, T, Dq-Dr]
+        # Output (each): [B, H, T, Dq]
+        queries = torch.cat((q_rotated, q_pass), dim=-1)
+        keys = torch.cat((k_rotated, k_pass), dim=-1)
+        # ===================
+        #       Attention
+        # ===================
+        # The tensors (queries, keys, values) now have shape [B, H, T, Dq]
+        # and are ready for the attention score calculation.
+        # Only apply dropout during training.
+        # self.training is a pytorch flag.
+        if self.training:
+            dropout_p = self.attention_dropout_prob
+        else:
+            dropout_p = 0.0
+        # Call SDPA / Flash Attention
+        # https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+        attn_output = F.scaled_dot_product_attention(
+            queries,
+            keys,
+            values,
+            attn_mask=None, # attention_mask,
+            dropout_p=dropout_p,
+            scale=self.softmax_scale,
+            is_causal=True, # This is a decoder - apply causal masking
+        )
+        # Reshape output back to [B, T, H * Dv] from [B, H, T, Dv]
+        attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, H * Dv)
+        # =========================
+        #     Output Projection
+        # =========================
+        # If we are using an output latent projection,
+        if self.latent_spaces and self.output_subspace:
+            # Project the attention output into the output latent space.
+            # This is analogous to the W^O matrix in standard attention but
+            # projects to an intermediate latent dimension.
+            attn_output = self.o_private_proj(attn_output)
+            # Apply normalization to the output latents
+            attn_output = self.o_private_norm(attn_output)
+            # Re-project the output latent representation back to model space.
+            attn_output = self.o_shared_proj(attn_output)
+        # If this is a dense layer,
+        else:
+            # Project the values back into model space.
+            attn_output = self.o_proj(attn_output)
+        # -----------------------------------------
+        return attn_output

checkpoint-2700/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f7a1430fc97b046dd29e562728ef516f253dba0862a4e8159c3b0f62449c3ba
+size 988989899

checkpoint-2700/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb5ec96e21b65268860c6e701fc4420d405a47a2a6ccf3b86e30f1c05dcca018
+size 494483579

checkpoint-2700/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c35fc70182e7ca9c2fecd3307516287efe33bd69963aae42900e793d8582f9b
+size 14645

checkpoint-2700/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c42329fe9ce60834aecb564501f655ead9473db295ff06d83d914a3e71fedd4
+size 1465

checkpoint-2700/shared_space_config.py ADDED Viewed

	@@ -0,0 +1,329 @@

+"""# `shared_space_config.py`
+#### `*Config`
+"""
+from typing import Optional
+import torch
+from torch import nn
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_utils import PreTrainedModel
+"""`def make_shorthand`"""
+def make_shorthand(model_cfg):
+    """
+    Takes an instance subencoder `*Config` and constructs a shorthand
+    name for the model based on settings.
+    """
+    dense_str = str(model_cfg.num_dense_layers) + "mha + "
+    if model_cfg.o_shared_dim is not None:
+        o_str = "." + str(model_cfg.o_shared_dim)
+    else:
+        o_str = ""
+    # If no output subspace is used, the dimension will show as -1.
+    attn_str = (
+        dense_str
+        + "mla."
+        + str(model_cfg.q_shared_dim)
+        + "."
+        + str(model_cfg.kv_shared_dim)
+        + o_str
+    )
+    # MLP Configuration
+    if model_cfg.ffn_decompose:
+        dense_str = (
+            str(model_cfg.num_dense_layers)
+            + "mlp."
+            + str(model_cfg.intermediate_size)
+            + " + "
+        )
+        mlp_str = (
+            dense_str
+            + str(model_cfg.num_hidden_layers - model_cfg.num_dense_layers)
+            + "dcmp."
+            + "x"
+            + str(model_cfg.intermediate_size)
+            + "."
+            + str(model_cfg.ffn_rank)
+        )
+    else:
+        mlp_str = "mlp." + str(model_cfg.intermediate_size)
+    # Assemble string
+    shorthand = (
+        f"{attn_str} - {mlp_str} - "
+        f"h{model_cfg.hidden_size} - l{model_cfg.num_hidden_layers}"
+    )
+    """
+    The run name includes training settings
+    run_name = (
+        f"{config['stats']['total_elements']} - "
+        f"{attn_str} - {mlp_str} - "
+        f"h{model_cfg.hidden_size} - l{model_cfg.num_hidden_layers} - "
+        f"bs{ptrain_cfg['train_batch_size']} - lr{lr_str} - "
+        f"seq{ptrain_cfg['max_seq_length']}"
+    )
+    """
+    return shorthand
+class SharedSpaceDecoderConfig(PretrainedConfig):
+    r"""
+    Configuration class for the shared-space decoder model.
+    Extends the HuggingFace `PretrainedConfig` to support architectural
+    variations including:
+    - Multi-Head Latent Attention (MLA)
+    - Decomposed MLPs (low-rank FFNs)
+    - Flexible attention backends (eager, flash, sdpa)
+    - Explicit shared subspaces for Q, K, V, and O projections
+    No dimension or rank is derived from `hidden_size`: the shared-subspace
+    and private head dimensions default to `None` and have to be supplied
+    explicitly. `__init__` only stores the values it is given. The consistency
+    checks live in `_validate()`, and the call to it from `__init__` is
+    currently commented out (see the TODO there), so nothing is validated
+    unless `_validate()` is invoked explicitly.
+    ----------------------
+    Core Model Parameters:
+    ----------------------
+    - vocab_size (`int`) — Vocabulary size.
+    - hidden_size (`int`) — Model hidden dimension.
+    - num_hidden_layers (`int`) — Number of transformer blocks.
+    - intermediate_size (`int`) — Feed-forward hidden dimension.
+    - hidden_dropout_prob (`float`) — Dropout after projections and FFNs.
+    - attention_dropout_prob (`float`) — Dropout applied to attention scores.
+    - max_position_embeddings (`int`) — Max sequence length.
+    - initializer_range (`float`) — Stddev of weight init.
+    - layer_norm_eps (`float`) — Epsilon for LayerNorm.
+    - rms_norm_eps (`float`) — Epsilon for RMSNorm.
+    - norm_type (`str`) — `"layernorm"` or `"rmsnorm"`.
+    - classifier_dropout (`float` or None) — Dropout for final classifier.
+    - vocab_subspace (`bool`) — Whether the vocabulary embeddings use a
+                                subspace (requires `vocab_rank`).
+    - vocab_rank (`int` or None) — Rank used when `vocab_subspace=True`.
+    - tie_word_embeddings (`bool`) — Stored on the config as given.
+    ----------------------------------
+    Multi-Head Latent Attention (MLA):
+    ----------------------------------
+    - num_attention_heads (`int`) — Number of attention heads.
+    - q_shared_dim (`int`) — Rank of the shared query subspace.
+    - kv_shared_dim (`int`) — Rank of the shared key/value subspace.
+    - o_shared_dim (`int` or None) — Rank of the shared output subspace.
+                                     If None, no output subspace is used.
+    - qk_private_dim (`int`) — Query/key private dimension per head.
+    - vo_private_dim (`int`) — Value/output private dimension per head.
+    - rope_dims (`int`) — Number of head dimensions carrying RoPE.
+    - nope_dims (`int`) — Non-positional encoding dimensions.
+    - rope_theta (`float`) — Base frequency used for RoPE.
+    - rope_scaling (`dict` or None) — HF-style scaling dict for RoPE.
+    - attention_bias (`bool`) — Whether to include bias terms in Q/K/V projections.
+    - num_dense_layers (`int`) — Number of leading layers that do not use
+                                 subspaces for attention or FFNs.
+    - attention_backend (`str`) — Must be one of `"eager"`, `"flash_attention_2"`, or `"sdpa"`.
+    ----------------------
+    Decomposed MLP (Low-Rank FFN):
+    ----------------------
+    - ffn_decompose (`bool`) — Whether to enable low-rank FFNs.
+    - ffn_rank (`int`) — Rank of the shared FFN latent space (required if `ffn_decompose=True`).
+    ----------------------
+    Validation Behavior:
+    ----------------------
+    `_validate()` (not called automatically at the moment) raises
+    `ValueError` if:
+    - `num_dense_layers` exceeds `num_hidden_layers`.
+    - `vocab_subspace` is enabled without `vocab_rank`.
+    - There are subspace layers but neither `q_shared_dim` nor
+      `kv_shared_dim` is set.
+    - `qk_private_dim`, `vo_private_dim` or `nope_dims` is missing.
+    - FFN decomposition is enabled without specifying `ffn_rank`, or with
+      no non-dense layers to apply it to.
+    - An unknown `attention_backend` or `norm_type` is provided.
+    """
+    model_type = "shared_subspace_decoder"
+    def __init__(
+        self,
+        # === Core Model ===
+        vocab_size:         int = 30522,
+        hidden_size:        int = 512,
+        num_hidden_layers:  int = 12,
+        intermediate_size:  int = 3072,
+        hidden_dropout_prob=0.1,
+        attention_dropout_prob=0.1,
+        max_position_embeddings: int = 2048,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        rms_norm_eps=1e-6, # Their default, but confirm in config.
+        norm_type="layernorm", # Choice between "layernorm" and "rmsnorm"
+        classifier_dropout=None,
+        vocab_subspace=False,
+        vocab_rank=None,
+        tie_word_embeddings=True,
+        # === Multi-Head Latent Attention ===
+        num_attention_heads: int = 16,
+        rope_dims:           int = 16,
+        q_shared_dim:        int = None,
+        kv_shared_dim:       int = None,
+        o_shared_dim=None,  # If None, no output subspace is used
+        # Private head dimensions
+        qk_private_dim:      int = None,  # Query/key private dimension per head
+        vo_private_dim:      int = None,  # Value/output private dimension per head
+        nope_dims:           int = None,  # Non-positional encoding dimensions
+        attention_backend="eager",
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        # === MLA Composition ===
+        num_dense_layers=12,  # dense MHA layers before MLA starts
+        # === Decomposed MLP ===
+        ffn_decompose=False,
+        ffn_rank=None,
+        **kwargs
+    ) -> None:
+        """
+        Store every setting on the instance; remaining keyword arguments are
+        forwarded to `PretrainedConfig.__init__`. No validation is performed
+        here (the `_validate()` call below is commented out).
+        """
+        super().__init__(**kwargs)
+        # === Core Model ===
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_dropout_prob = attention_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.rms_norm_eps = rms_norm_eps
+        self.norm_type = norm_type
+        self.classifier_dropout = classifier_dropout
+        self.vocab_subspace = vocab_subspace
+        self.vocab_rank = vocab_rank
+        self.tie_word_embeddings = tie_word_embeddings
+        # === MLA ===
+        self.num_attention_heads = num_attention_heads
+        self.rope_dims = rope_dims
+        self.q_shared_dim = q_shared_dim
+        self.kv_shared_dim = kv_shared_dim
+        self.o_shared_dim = o_shared_dim
+        # Private head dimensions
+        self.qk_private_dim = qk_private_dim
+        self.vo_private_dim = vo_private_dim
+        self.nope_dims = nope_dims
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.num_dense_layers = num_dense_layers
+        # === Decomposed FFN ===
+        self.ffn_decompose = ffn_decompose
+        self.ffn_rank = ffn_rank
+        # === Attention backend ===
+        self.attention_backend = attention_backend
+        # === Validation ===
+        # TODO - Somewhere during training these get instantiated with bad
+        #        values...
+        # Validation is therefore disabled for now; call `_validate()`
+        # manually on configs that are expected to be complete.
+        #self._validate()
+        #print(f"  > SubEnc *Config.init: {make_shorthand(self)}\n")
+    def _validate(self):
+        """
+        Check the stored settings for consistency.
+
+        Raises:
+            ValueError: On the first inconsistent or missing setting found
+                (see the individual checks below).
+        """
+        # === Model ===
+        if self.num_dense_layers > self.num_hidden_layers:
+            raise ValueError("`num_dense_layers` must be <= `num_hidden_layers`")
+        if self.vocab_subspace and self.vocab_rank is None:
+            raise ValueError("`vocab_rank` must be set when `vocab_subspace=True`")
+        # === MLA Validation ===
+        # At least one of q_shared_dim or kv_shared_dim must be set if we have subspace layers
+        if self.num_dense_layers < self.num_hidden_layers and self.q_shared_dim is None and self.kv_shared_dim is None:
+            raise ValueError("At least one of q_shared_dim or kv_shared_dim must be set when there are subspace layers")
+        # Validate that private dimensions are set
+        if self.qk_private_dim is None or self.vo_private_dim is None:
+            raise ValueError("Must set qk_private_dim and vo_private_dim")
+        if self.nope_dims is None:
+            raise ValueError("Must set nope_dims")
+        # === Decomposed FFN ===
+        if self.ffn_decompose and self.ffn_rank is None:
+            raise ValueError("`ffn_rank` must be set when `ffn_decompose=True`")
+        # Decomposition only applies to non-dense layers, so there must be
+        # at least one of them.
+        if self.ffn_decompose and self.num_dense_layers >= self.num_hidden_layers:
+            raise ValueError("`ffn_decompose` was set but `num_dense` is >= number of layers")
+        # === Attention Backend ===
+        valid_backends = ["eager", "flash_attention_2", "sdpa"]
+        if self.attention_backend not in valid_backends:
+            raise ValueError(f"Unknown attention backend: {self.attention_backend}, options are {valid_backends}")
+        # === Norm Type ===
+        valid_norm_types = ["layernorm", "rmsnorm"]
+        if self.norm_type not in valid_norm_types:
+            raise ValueError(f"Unknown norm type: {self.norm_type}, options are {valid_norm_types}")
+#### `get_config`
+import json
+def get_config(filename):
+    # Load the config file.
+    with open(filename) as f:
+        full_cfg = json.load(f)
+    # Strict key check on the model configuration.
+    # Get the list of keys allowed / required by `*Config`
+    valid_keys = SharedSpaceDecoderConfig.__init__.__code__.co_varnames
+    # Remove `self` and `kwargs`
+    valid_keys = set(valid_keys) - {"self", "kwargs"}
+    # Compare the set of keys in the json file vs `*Config`
+    extra_keys = set(full_cfg["model"]) - valid_keys
+    missing_keys = valid_keys - set(full_cfg["model"])
+    # If there any in the `json` that aren't in `*Config`,
+    if extra_keys:
+        # List them for the user.
+        raise ValueError(f"Unknown keys in config: {sorted(extra_keys)}")
+    #  If the json config is missing required keys,
+    if missing_keys:
+        # List them for the user.
+        raise ValueError(f"config json is missing: {sorted(missing_keys)}")
+    # Will raise TypeError, by design, if required args are missing
+    # The asterisks unpack the dictionary into a list of keywords as though
+    # all of the settings were writting out individually.
+    model_cfg = SharedSpaceDecoderConfig(**full_cfg["model"])
+    return full_cfg, model_cfg

checkpoint-2700/shared_space_decoder.py ADDED Viewed

	@@ -0,0 +1,386 @@

+# -*- coding: utf-8 -*-
+"""# shared_space_decoder.py"""
+from typing import Optional
+import torch
+from torch import nn
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_utils import PreTrainedModel
+from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
+from .mla import MultiheadLatentAttention, RotaryEmbedding
+from .feedforward import SubspaceFeedForward
+from .shared_space_config import SharedSpaceDecoderConfig
+"""`RMSNorm`
+From:
+https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py
+TODO - May not need?
+"""
+class DeepseekV3RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        DeepseekV3RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
+    """
+    Create a normalization layer based on the config norm_type.
+    Args:
+        hidden_size: The dimension to normalize over
+        config: Configuration containing norm_type and epsilon values
+    Returns:
+        Either a LayerNorm or RMSNorm layer
+    """
+    if config.norm_type == "layernorm":
+        return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+    elif config.norm_type == "rmsnorm":
+        return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
+    else:
+        # This should be caught by config validation, but being defensive
+        raise ValueError(f"Unknown norm_type: {config.norm_type}")
+"""#### *PreTrainedModel"""
+class SharedSpaceDecoderPreTrainedModel(PreTrainedModel):
+    """
+    The **PreTrainedModel object:
+      - Is instantiated when TODO
+      - Initializes:
+        - TODO
+      - Provides access to TODO
+      - Executes TODO
+    """
+    config_class = SharedSpaceDecoderConfig
+    base_model_prefix = "model"
+    def _init_weights(self, module: nn.Module) -> None:
+        """Weight initialization hook used by :class:`PreTrainedModel`.
+        ``PreTrainedModel.post_init`` will recursively apply this function to
+        every submodule right after construction.  HuggingFace models override
+        it so that creating a model from scratch yields the same initialization
+        as ``from_pretrained`` when no checkpoint is supplied.
+        This decoder-specific initialization strategy includes:
+        - Proper handling of configurable normalization layers (LayerNorm or RMSNorm)
+        - Special initialization for language modeling heads
+        - Considerations for causal attention and autoregressive modeling
+        - Support for both dense and decomposed vocabulary embeddings
+        """
+        if isinstance(module, nn.Linear):
+            # Standard linear layer initialization
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            # Initialize embeddings with normal distribution
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, DeepseekV3RMSNorm):
+            # RMSNorm initialization: weight to 1.0, no bias term
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.LayerNorm):
+            # LayerNorm initialization: bias to 0, weight to 1.0
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+"""# ▂▂▂▂▂▂▂▂▂▂▂▂
+# Classes
+"""
+"""#### `*Layer`"""
+class SharedSpaceDecoderLayer(nn.Module):
+    """
+    The **Layer object:
+      - Is instantiated by :class:`SharedSpaceDecoderModel` for each
+        Transformer block in the decoder.
+      - Initializes:
+        - ``self_attn`` – multi-head latent attention implementing either
+          dense or latent projections depending on the configuration.
+        - ``ffn`` – a :class:`SubspaceFeedForward` block.
+        - RMSNorm layers for pre-attention and pre-FFN normalization.
+      - Provides access to the attention and feed-forward submodules via the
+        attributes ``self_attn`` and ``ffn``.
+      - Executes a single decoder block in :meth:`forward`.
+    """
+    def __init__(self, config: SharedSpaceDecoderConfig, layer_idx: int) -> None:
+        super().__init__()
+        # Norm applied prior to attention.
+        self.attn_input_norm = create_norm_layer(config.hidden_size, config)
+        # Attention block
+        self.self_attn = MultiheadLatentAttention(config, layer_idx)
+        # Norm applied prior to FFN
+        self.ffn_input_norm = create_norm_layer(config.hidden_size, config)
+        # Feed-forward network used after attention
+        self.ffn = SubspaceFeedForward(config, layer_idx)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor], # RoPE embeddings
+        attention_mask: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # ========================
+        #     Self Attention
+        # ========================
+        residual_strm = hidden_states
+        # Normalize the hidden states to create the input to attention.
+        attn_input = self.attn_input_norm(hidden_states)
+        # Evaluate
+        attn_output = self.self_attn(
+            attn_input,
+            position_embeddings,
+            attention_mask,
+        )
+        # Add the attention output (the residual) back to the non-normalized
+        # hidden_states.
+        hidden_states = residual_strm + attn_output
+        # ===========================
+        #     Feed-Forward Network
+        # ===========================
+        residual_strm = hidden_states
+        # Normalize the updated hidden states prior to the FFN
+        ffn_input = self.ffn_input_norm(hidden_states)
+        # Evaluate
+        ffn_output = self.ffn(ffn_input)
+        # Add the output the un-normalized hidden states.
+        hidden_states = residual_strm + ffn_output
+        return hidden_states
+"""#### *Model"""
+class SharedSpaceDecoderModel(SharedSpaceDecoderPreTrainedModel):
+    """
+    The **Model object:
+      - Initializes:
+        - The vocabulary embeddings (and optional decomposition)
+        - Position embeddings (calculated in RotaryEmbedding)
+        - All of the **Layer objects.
+      - Provides interface to vocab embeddings.
+      - Executes the whole decoder model in `forward` with causal attention.
+      This is the base decoder without the language modeling head.
+      Use SubspaceDecoderForCausalLM for language modeling tasks.
+    """
+    def __init__(self, config: SharedSpaceDecoderConfig) -> None:
+        super().__init__(config)
+        # ============================
+        #    Vocabulary Embeddings
+        # ============================
+        # Decomposing the vocabulary (if enabled) defines a shared projection
+        # which constrains the model to store semantic information (and
+        # whatever other static token knowledge) into a limited set of
+        # feature directions.
+        # If we're decomposing the token embeddings,
+        # TODO - Rename to vocab_subspace.
+        if config.vocab_subspace:
+            # Create the embedding table. Vocabulary embeddings are learned
+            # in a lower dimensional latent space.
+            self.vocab_embed = nn.Embedding(
+                config.vocab_size, # Number of tokens
+                config.vocab_rank  # Subspace dimension
+            )
+            # Create a
+            # Selected token latents will be projected up to model size.
+            # vocab_proj has shape [vocab_rank x model_size]
+            self.vocab_proj = nn.Linear(
+                config.vocab_rank,  # Size of latents
+                config.hidden_size, # Model size
+                bias=False
+            )
+        # Otherwise, for a dense vocabulary,
+        else:
+            # Create the dense embedding table in model space.
+            self.vocab_embed = nn.Embedding(
+                config.vocab_size,  # Number of tokens
+                config.hidden_size  # Model size
+            )
+            self.vocab_proj = None
+        # =====================
+        #   RoPE Embeddings
+        # =====================
+        # Pre-computes the table of RoPE embeddings, leaving them in
+        # GPU memory.
+        self.rope = RotaryEmbedding(config)
+        # ===================
+        #    Create Layers
+        # ===================
+        layers = []
+        # For each layer,
+        for i in range(config.num_hidden_layers):
+            # Create a **Layer, providing the config and indicating its number.
+            layers.append(
+                SharedSpaceDecoderLayer(
+                    config,
+                    layer_idx = i
+                )
+            )
+        # Wrap in torch ModuleList
+        self.layers = nn.ModuleList(layers)
+        # Whatever huggingface does behind the scenes...
+        self.post_init()
+    # Agents: Do not define boilerplate helpers, e.g., get/set_input_embeddings
+    def embed(self, input_ids: torch.LongTensor) -> torch.Tensor:
+        """
+        Return token embeddings for input ids.
+        This will perform the up projection to model space if the vocabulary is
+        decomposed.
+        input_ids have shape [batch_size, seq_len]
+        """
+        # If the vocabulary is decomposed,
+        if self.vocab_proj is not None:
+            # Retrieve the latents
+            #  input_ids: [batch_size, seq_len]
+            #          x: [batch_size, seq_len, latent_dim]
+            x = self.vocab_embed(input_ids)
+            #  Project the latents back to model space and return.
+            return(self.vocab_proj(x))
+        # If the vocabulary is dense,
+        else:
+            # Just return the embeddings.
+            return self.vocab_embed(input_ids)
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Run the full decoder stack with causal attention.
+        Inputs:
+            input_ids       [batch_size, seq_len]
+            attention_mask  [batch_size, seq_len] - 1 for real tokens, 0 for padding
+        Returns:
+            Final decoder layer output   [batch_size, seq_len, model_size]
+        """
+        # Retrieve the token embeddings for this sequence.
+        # These are model_size, regardless of whether the vocab is decompd.
+        hidden_states = self.embed(input_ids)
+        # Retrieve the rotary position embeddings for all of the positions in
+        # our current input sequence.
+        seq_len = hidden_states.size(1)
+        # Retrieves just the ones necessary for the sequence length of the
+        # input. These are vectors, two per token. Their length is the
+        # number of head dimensions we're applying RoPE to.
+        #  Input
+        #     cos: [max_seq_len, rope_dims]
+        #     sin: [max_seq_len, rope_dims]
+        #  Outputs:
+        #     R_cos [seq_len, rope_dims]
+        #     R_sin [seq_len, rope_dims]
+        R_cos = self.rope.cos[:seq_len]
+        R_sin = self.rope.sin[:seq_len]
+        # ===============================
+        #   Attention Mask Conversion
+        # ===============================
+        """
+        use_sdpa_attention_masks = (
+            self.attn_implementation == "sdpa"
+            and self.position_embedding_type == "absolute"
+            and head_mask is None
+            and not output_attentions
+        )
+        """
+        # Expand the attention mask
+        #if use_sdpa_attention_masks and attention_mask.dim() == 2:
+        if True:
+            # Expand the attention mask for SDPA.
+            # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
+            extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                attention_mask,
+                hidden_states.dtype,
+                tgt_len = seq_len
+            )
+            attention_mask = extended_attention_mask
+        # Run the model!
+        # For each decoder layer,
+        for layer_i, layer in enumerate(self.layers):
+            # Evaluate the layer
+            hidden_states = layer(
+                hidden_states,       # Token embeddings
+                (R_cos, R_sin),      # Rope embeddings, passed as a tuple.
+                attention_mask,      # Attn mask
+            )
+        # Return the final output of the decoder stack.
+        return hidden_states

checkpoint-2700/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

checkpoint-2700/task_heads.py ADDED Viewed

	@@ -0,0 +1,196 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Union
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from .shared_space_config import SharedSpaceDecoderConfig
+from .shared_space_decoder import (
+    SharedSpaceDecoderPreTrainedModel,
+    SharedSpaceDecoderModel,
+    DeepseekV3RMSNorm
+)
+def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
+    """
+    Create a normalization layer based on the config norm_type.
+    Args:
+        hidden_size: The dimension to normalize over
+        config: Configuration containing norm_type and epsilon values
+    Returns:
+        Either a LayerNorm or RMSNorm layer
+    """
+    if config.norm_type == "layernorm":
+        return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+    elif config.norm_type == "rmsnorm":
+        from .shared_space_decoder import DeepseekV3RMSNorm
+        return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
+    else:
+        # This should be caught by config validation, but being defensive
+        raise ValueError(f"Unknown norm_type: {config.norm_type}")
+class SharedSpaceDecoderForCausalLM(SharedSpaceDecoderPreTrainedModel):
+    """
+    Subspace Decoder model with a causal language modeling head.
+    This model wraps a SharedSpaceDecoderModel and adds:
+    - A final normalization layer applied to the decoder output
+    - A bias-free language modeling head that projects hidden states to
+      vocabulary logits
+    - Loss computation through the inherited `loss_function` when labels
+      are supplied
+    - The input/output embedding accessors and `tie_weights` hook
+    No KV cache is implemented: `past_key_values` is always returned as None.
+    """
+    def __init__(self, config: SharedSpaceDecoderConfig) -> None:
+        """Build the decoder, the final norm and the LM head, then run `post_init`."""
+        super().__init__(config)
+        # Initialize the base decoder model
+        self.model = SharedSpaceDecoderModel(config)
+        # Final layer norm before the language modeling head
+        self.norm = create_norm_layer(config.hidden_size, config)
+        # Language modeling head
+        # Projects from hidden_size to vocab_size to get logits for each token
+        self.lm_head = nn.Linear(
+            config.hidden_size,
+            config.vocab_size,
+            bias=False  # Following common practice in modern LMs
+        )
+        # Initialize weights with decoder-specific strategy
+        # NOTE(review): weight tying is expected to be triggered by
+        # post_init() when config.tie_word_embeddings=True — confirm against
+        # the installed transformers version.
+        self.post_init()
+    def _init_weights(self, module: nn.Module) -> None:
+        """
+        Initialize one submodule, with extra handling for the LM head.
+        Every module first goes through the base-class initialization. If the
+        module is `self.lm_head`, its weight is then drawn again from
+        normal(0, `initializer_range`), and once more with half that standard
+        deviation when the model uses a vocabulary projection (`vocab_proj`),
+        i.e. when the head is not tied to the input embeddings.
+        """
+        # Use the base class initialization for most modules
+        super()._init_weights(module)
+        # Special handling for language modeling head
+        if module is self.lm_head:
+            # Re-draw the head's weight (the base class already initialized
+            # it as a plain nn.Linear).
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            # If weight tying is not used, we might want even smaller init
+            if self.model.vocab_proj is not None:
+                # For vocab subspace models where weights aren't tied,
+                # use a smaller scale to prevent initial logits from being too large
+                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range * 0.5)
+    def get_input_embeddings(self):
+        """Return the input embedding layer for compatibility with HuggingFace."""
+        return self.model.vocab_embed
+    def set_input_embeddings(self, value):
+        """Set the input embedding layer for compatibility with HuggingFace."""
+        self.model.vocab_embed = value
+    def get_output_embeddings(self):
+        """Return the output embedding layer (lm_head) for compatibility."""
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        """Set the output embedding layer for compatibility."""
+        self.lm_head = new_embeddings
+    def tie_weights(self):
+        """
+        Tie the input and output embedding weights where that is possible.
+        When the vocabulary is dense (no `vocab_proj`), the language modeling
+        head is tied to (or cloned from) the input embedding table via the
+        HuggingFace helper. For vocab subspace models the embedding table
+        lives in the latent space rather than model space, so the head is
+        left untied.
+        """
+        # Only tie when embeddings live in model space (no vocab_proj)
+        if getattr(self.model, "vocab_proj", None) is None:
+            # Use HF utility for correct tying/cloning semantics
+            self._tie_or_clone_weights(self.lm_head, self.model.vocab_embed)
+        # else: leave untied for subspace case
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[CausalLMOutputWithPast, tuple]:
+        """
+        Forward pass for causal language modeling.
+        Args:
+            input_ids: Token ids of shape [batch_size, seq_len]
+            attention_mask: Attention mask of shape [batch_size, seq_len]
+                           (1 for real tokens, 0 for padding)
+            labels: Ground truth token ids for computing loss. Same shape as input_ids.
+                   If provided, loss will be computed.
+                   NOTE(review): the standard HuggingFace causal-LM loss
+                   shifts the labels internally, so labels should presumably
+                   be passed unshifted (e.g. a copy of input_ids) — confirm.
+            **kwargs: Forwarded to the base decoder and, when labels are
+                   given, to `loss_function`. `output_hidden_states` is also
+                   read here (see Returns).
+        Returns:
+            CausalLMOutputWithPast containing:
+            - logits: Prediction logits of shape [batch_size, seq_len, vocab_size]
+            - loss: Loss from `loss_function` if labels provided, else None
+            - hidden_states: The final, normalized hidden states
+              [batch_size, seq_len, hidden_size] as a single tensor, only
+              when `output_hidden_states` is passed as a truthy keyword
+              argument; otherwise None
+            - past_key_values / attentions: always None
+            A tuple is never returned, despite the annotation.
+        """
+        # Run the base decoder model
+        # This applies all the transformer layers
+        hidden_states = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            **kwargs
+        )
+        # Apply final layer normalization
+        # This normalizes the final hidden states before the language modeling head
+        hidden_states = self.norm(hidden_states)
+        # Project to vocabulary logits
+        # Shape: [batch_size, seq_len, vocab_size]
+        logits = self.lm_head(hidden_states)
+        # Compute loss if labels are provided
+        # Previously, we had custom loss computation here, but now we use the
+        # standard HuggingFace loss function.
+        loss = None
+        if labels is not None:
+            # Delegate to the loss function provided by the HF base class.
+            loss = self.loss_function(
+                logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+        # Return in HuggingFace format
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=None,  # Not implementing KV cache yet
+            # Only expose the (post-norm) hidden states on request.
+            hidden_states=hidden_states if kwargs.get("output_hidden_states", False) else None,
+            attentions=None,
+        )

checkpoint-2700/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2700/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

checkpoint-2700/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1060 @@

+{
+  "best_global_step": 2700,
+  "best_metric": 3.600645640400387,
+  "best_model_checkpoint": "checkpoints/gpt-2_seq1024_mla0-0-0/checkpoint-2700",
+  "epoch": 0.8138045362067666,
+  "eval_steps": 300,
+  "global_step": 2700,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006028181749679753,
+      "grad_norm": 2.0441031455993652,
+      "learning_rate": 3.166666666666667e-05,
+      "loss": 10.3921,
+      "step": 20
+    },
+    {
+      "epoch": 0.012056363499359506,
+      "grad_norm": 1.7166392803192139,
+      "learning_rate": 6.500000000000001e-05,
+      "loss": 9.241,
+      "step": 40
+    },
+    {
+      "epoch": 0.01808454524903926,
+      "grad_norm": 0.7614754438400269,
+      "learning_rate": 9.833333333333333e-05,
+      "loss": 8.2219,
+      "step": 60
+    },
+    {
+      "epoch": 0.02411272699871901,
+      "grad_norm": 0.46116286516189575,
+      "learning_rate": 0.00013166666666666665,
+      "loss": 7.5368,
+      "step": 80
+    },
+    {
+      "epoch": 0.030140908748398764,
+      "grad_norm": 1.6069883108139038,
+      "learning_rate": 0.000165,
+      "loss": 7.21,
+      "step": 100
+    },
+    {
+      "epoch": 0.03616909049807852,
+      "grad_norm": 1.2077178955078125,
+      "learning_rate": 0.00019833333333333335,
+      "loss": 6.8976,
+      "step": 120
+    },
+    {
+      "epoch": 0.04219727224775827,
+      "grad_norm": 0.642643928527832,
+      "learning_rate": 0.00023166666666666667,
+      "loss": 6.641,
+      "step": 140
+    },
+    {
+      "epoch": 0.04822545399743802,
+      "grad_norm": 0.7632517218589783,
+      "learning_rate": 0.00026500000000000004,
+      "loss": 6.4194,
+      "step": 160
+    },
+    {
+      "epoch": 0.05425363574711778,
+      "grad_norm": 1.2542935609817505,
+      "learning_rate": 0.00029833333333333334,
+      "loss": 6.2428,
+      "step": 180
+    },
+    {
+      "epoch": 0.06028181749679753,
+      "grad_norm": 1.1738799810409546,
+      "learning_rate": 0.0003316666666666667,
+      "loss": 6.1026,
+      "step": 200
+    },
+    {
+      "epoch": 0.06630999924647728,
+      "grad_norm": 0.46843603253364563,
+      "learning_rate": 0.000365,
+      "loss": 5.9586,
+      "step": 220
+    },
+    {
+      "epoch": 0.07233818099615703,
+      "grad_norm": 0.3949953019618988,
+      "learning_rate": 0.00039833333333333333,
+      "loss": 5.8304,
+      "step": 240
+    },
+    {
+      "epoch": 0.07836636274583679,
+      "grad_norm": 1.0866719484329224,
+      "learning_rate": 0.0004316666666666667,
+      "loss": 5.7185,
+      "step": 260
+    },
+    {
+      "epoch": 0.08439454449551655,
+      "grad_norm": 0.34839189052581787,
+      "learning_rate": 0.000465,
+      "loss": 5.6037,
+      "step": 280
+    },
+    {
+      "epoch": 0.09042272624519629,
+      "grad_norm": 0.5791395902633667,
+      "learning_rate": 0.0004983333333333334,
+      "loss": 5.5003,
+      "step": 300
+    },
+    {
+      "epoch": 0.09042272624519629,
+      "eval_loss": 5.428642945231348,
+      "eval_perplexity": 227.839844387096,
+      "eval_runtime": 113.1707,
+      "eval_samples_per_second": 14.659,
+      "eval_steps_per_second": 0.23,
+      "step": 300
+    },
+    {
+      "epoch": 0.09645090799487605,
+      "grad_norm": 0.532134473323822,
+      "learning_rate": 0.0004964814814814814,
+      "loss": 5.3805,
+      "step": 320
+    },
+    {
+      "epoch": 0.1024790897445558,
+      "grad_norm": 0.5876124501228333,
+      "learning_rate": 0.0004927777777777777,
+      "loss": 5.2775,
+      "step": 340
+    },
+    {
+      "epoch": 0.10850727149423556,
+      "grad_norm": 0.4140375256538391,
+      "learning_rate": 0.0004890740740740741,
+      "loss": 5.193,
+      "step": 360
+    },
+    {
+      "epoch": 0.1145354532439153,
+      "grad_norm": 0.5529780387878418,
+      "learning_rate": 0.0004853703703703704,
+      "loss": 5.0912,
+      "step": 380
+    },
+    {
+      "epoch": 0.12056363499359506,
+      "grad_norm": 0.3331276774406433,
+      "learning_rate": 0.0004816666666666667,
+      "loss": 5.0266,
+      "step": 400
+    },
+    {
+      "epoch": 0.1265918167432748,
+      "grad_norm": 0.6916081309318542,
+      "learning_rate": 0.00047796296296296297,
+      "loss": 4.9375,
+      "step": 420
+    },
+    {
+      "epoch": 0.13261999849295456,
+      "grad_norm": 0.31799590587615967,
+      "learning_rate": 0.0004742592592592593,
+      "loss": 4.9026,
+      "step": 440
+    },
+    {
+      "epoch": 0.1386481802426343,
+      "grad_norm": 0.5735257267951965,
+      "learning_rate": 0.00047055555555555555,
+      "loss": 4.8075,
+      "step": 460
+    },
+    {
+      "epoch": 0.14467636199231407,
+      "grad_norm": 0.5078163146972656,
+      "learning_rate": 0.00046685185185185187,
+      "loss": 4.759,
+      "step": 480
+    },
+    {
+      "epoch": 0.15070454374199382,
+      "grad_norm": 0.3884360194206238,
+      "learning_rate": 0.00046314814814814813,
+      "loss": 4.6706,
+      "step": 500
+    },
+    {
+      "epoch": 0.15673272549167358,
+      "grad_norm": 0.5591968297958374,
+      "learning_rate": 0.00045944444444444445,
+      "loss": 4.627,
+      "step": 520
+    },
+    {
+      "epoch": 0.16276090724135334,
+      "grad_norm": 0.29390859603881836,
+      "learning_rate": 0.0004557407407407407,
+      "loss": 4.5691,
+      "step": 540
+    },
+    {
+      "epoch": 0.1687890889910331,
+      "grad_norm": 0.3524036407470703,
+      "learning_rate": 0.00045203703703703703,
+      "loss": 4.5066,
+      "step": 560
+    },
+    {
+      "epoch": 0.17481727074071282,
+      "grad_norm": 0.4607946574687958,
+      "learning_rate": 0.0004483333333333333,
+      "loss": 4.4661,
+      "step": 580
+    },
+    {
+      "epoch": 0.18084545249039258,
+      "grad_norm": 0.2523379623889923,
+      "learning_rate": 0.00044462962962962967,
+      "loss": 4.4331,
+      "step": 600
+    },
+    {
+      "epoch": 0.18084545249039258,
+      "eval_loss": 4.357323692651888,
+      "eval_perplexity": 78.04797429242741,
+      "eval_runtime": 112.5783,
+      "eval_samples_per_second": 14.736,
+      "eval_steps_per_second": 0.231,
+      "step": 600
+    },
+    {
+      "epoch": 0.18687363424007233,
+      "grad_norm": 0.3953287601470947,
+      "learning_rate": 0.00044092592592592594,
+      "loss": 4.3877,
+      "step": 620
+    },
+    {
+      "epoch": 0.1929018159897521,
+      "grad_norm": 0.38012024760246277,
+      "learning_rate": 0.00043722222222222225,
+      "loss": 4.3619,
+      "step": 640
+    },
+    {
+      "epoch": 0.19892999773943185,
+      "grad_norm": 0.3135126233100891,
+      "learning_rate": 0.0004335185185185185,
+      "loss": 4.3442,
+      "step": 660
+    },
+    {
+      "epoch": 0.2049581794891116,
+      "grad_norm": 0.49422895908355713,
+      "learning_rate": 0.00042981481481481484,
+      "loss": 4.3045,
+      "step": 680
+    },
+    {
+      "epoch": 0.21098636123879136,
+      "grad_norm": 0.27019643783569336,
+      "learning_rate": 0.0004261111111111111,
+      "loss": 4.2808,
+      "step": 700
+    },
+    {
+      "epoch": 0.21701454298847112,
+      "grad_norm": 0.3935423791408539,
+      "learning_rate": 0.0004224074074074074,
+      "loss": 4.2467,
+      "step": 720
+    },
+    {
+      "epoch": 0.22304272473815084,
+      "grad_norm": 0.28830042481422424,
+      "learning_rate": 0.0004187037037037037,
+      "loss": 4.2274,
+      "step": 740
+    },
+    {
+      "epoch": 0.2290709064878306,
+      "grad_norm": 0.2877708673477173,
+      "learning_rate": 0.000415,
+      "loss": 4.2075,
+      "step": 760
+    },
+    {
+      "epoch": 0.23509908823751036,
+      "grad_norm": 0.3449930250644684,
+      "learning_rate": 0.00041129629629629627,
+      "loss": 4.1909,
+      "step": 780
+    },
+    {
+      "epoch": 0.2411272699871901,
+      "grad_norm": 0.326265424489975,
+      "learning_rate": 0.00040759259259259264,
+      "loss": 4.1684,
+      "step": 800
+    },
+    {
+      "epoch": 0.24715545173686987,
+      "grad_norm": 0.3821180760860443,
+      "learning_rate": 0.0004038888888888889,
+      "loss": 4.1513,
+      "step": 820
+    },
+    {
+      "epoch": 0.2531836334865496,
+      "grad_norm": 0.2768200635910034,
+      "learning_rate": 0.0004001851851851852,
+      "loss": 4.1317,
+      "step": 840
+    },
+    {
+      "epoch": 0.2592118152362294,
+      "grad_norm": 0.29724735021591187,
+      "learning_rate": 0.0003964814814814815,
+      "loss": 4.1171,
+      "step": 860
+    },
+    {
+      "epoch": 0.2652399969859091,
+      "grad_norm": 0.2814749479293823,
+      "learning_rate": 0.0003927777777777778,
+      "loss": 4.1049,
+      "step": 880
+    },
+    {
+      "epoch": 0.2712681787355889,
+      "grad_norm": 0.26310989260673523,
+      "learning_rate": 0.00038907407407407407,
+      "loss": 4.0858,
+      "step": 900
+    },
+    {
+      "epoch": 0.2712681787355889,
+      "eval_loss": 4.030442206068737,
+      "eval_perplexity": 56.285795665128816,
+      "eval_runtime": 112.6912,
+      "eval_samples_per_second": 14.722,
+      "eval_steps_per_second": 0.231,
+      "step": 900
+    },
+    {
+      "epoch": 0.2772963604852686,
+      "grad_norm": 0.3063352108001709,
+      "learning_rate": 0.0003853703703703704,
+      "loss": 4.0813,
+      "step": 920
+    },
+    {
+      "epoch": 0.2833245422349484,
+      "grad_norm": 0.4063352346420288,
+      "learning_rate": 0.00038166666666666666,
+      "loss": 4.0647,
+      "step": 940
+    },
+    {
+      "epoch": 0.28935272398462814,
+      "grad_norm": 0.2601182758808136,
+      "learning_rate": 0.000377962962962963,
+      "loss": 4.0447,
+      "step": 960
+    },
+    {
+      "epoch": 0.29538090573430786,
+      "grad_norm": 0.3153649568557739,
+      "learning_rate": 0.00037425925925925924,
+      "loss": 4.0417,
+      "step": 980
+    },
+    {
+      "epoch": 0.30140908748398765,
+      "grad_norm": 0.26004716753959656,
+      "learning_rate": 0.0003705555555555556,
+      "loss": 4.0279,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3074372692336674,
+      "grad_norm": 0.43524181842803955,
+      "learning_rate": 0.0003668518518518519,
+      "loss": 4.0152,
+      "step": 1020
+    },
+    {
+      "epoch": 0.31346545098334716,
+      "grad_norm": 0.2601913511753082,
+      "learning_rate": 0.0003631481481481482,
+      "loss": 4.0026,
+      "step": 1040
+    },
+    {
+      "epoch": 0.3194936327330269,
+      "grad_norm": 0.25712424516677856,
+      "learning_rate": 0.00035944444444444446,
+      "loss": 3.9933,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3255218144827067,
+      "grad_norm": 0.3313097357749939,
+      "learning_rate": 0.0003557407407407408,
+      "loss": 3.9842,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3315499962323864,
+      "grad_norm": 0.238124281167984,
+      "learning_rate": 0.00035203703703703704,
+      "loss": 3.9745,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3375781779820662,
+      "grad_norm": 0.28039032220840454,
+      "learning_rate": 0.00034833333333333336,
+      "loss": 3.9588,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3436063597317459,
+      "grad_norm": 0.3661600649356842,
+      "learning_rate": 0.0003446296296296296,
+      "loss": 3.9532,
+      "step": 1140
+    },
+    {
+      "epoch": 0.34963454148142564,
+      "grad_norm": 0.22854666411876678,
+      "learning_rate": 0.0003409259259259259,
+      "loss": 3.9392,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3556627232311054,
+      "grad_norm": 0.2930833697319031,
+      "learning_rate": 0.0003372222222222222,
+      "loss": 3.9312,
+      "step": 1180
+    },
+    {
+      "epoch": 0.36169090498078516,
+      "grad_norm": 0.25500425696372986,
+      "learning_rate": 0.0003335185185185185,
+      "loss": 3.9295,
+      "step": 1200
+    },
+    {
+      "epoch": 0.36169090498078516,
+      "eval_loss": 3.874079793811062,
+      "eval_perplexity": 48.13838066773452,
+      "eval_runtime": 112.5895,
+      "eval_samples_per_second": 14.735,
+      "eval_steps_per_second": 0.231,
+      "step": 1200
+    },
+    {
+      "epoch": 0.36771908673046494,
+      "grad_norm": 0.27924397587776184,
+      "learning_rate": 0.00032981481481481485,
+      "loss": 3.9157,
+      "step": 1220
+    },
+    {
+      "epoch": 0.37374726848014467,
+      "grad_norm": 0.2649654448032379,
+      "learning_rate": 0.0003261111111111111,
+      "loss": 3.9136,
+      "step": 1240
+    },
+    {
+      "epoch": 0.37977545022982445,
+      "grad_norm": 0.27920904755592346,
+      "learning_rate": 0.00032240740740740743,
+      "loss": 3.899,
+      "step": 1260
+    },
+    {
+      "epoch": 0.3858036319795042,
+      "grad_norm": 0.238219752907753,
+      "learning_rate": 0.0003187037037037037,
+      "loss": 3.8965,
+      "step": 1280
+    },
+    {
+      "epoch": 0.3918318137291839,
+      "grad_norm": 0.2615567743778229,
+      "learning_rate": 0.000315,
+      "loss": 3.8877,
+      "step": 1300
+    },
+    {
+      "epoch": 0.3978599954788637,
+      "grad_norm": 0.266513466835022,
+      "learning_rate": 0.0003112962962962963,
+      "loss": 3.8806,
+      "step": 1320
+    },
+    {
+      "epoch": 0.4038881772285434,
+      "grad_norm": 0.3212108314037323,
+      "learning_rate": 0.0003075925925925926,
+      "loss": 3.8774,
+      "step": 1340
+    },
+    {
+      "epoch": 0.4099163589782232,
+      "grad_norm": 0.231920063495636,
+      "learning_rate": 0.00030388888888888886,
+      "loss": 3.8707,
+      "step": 1360
+    },
+    {
+      "epoch": 0.41594454072790293,
+      "grad_norm": 0.27022039890289307,
+      "learning_rate": 0.0003001851851851852,
+      "loss": 3.863,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4219727224775827,
+      "grad_norm": 0.243785560131073,
+      "learning_rate": 0.00029648148148148144,
+      "loss": 3.8562,
+      "step": 1400
+    },
+    {
+      "epoch": 0.42800090422726245,
+      "grad_norm": 0.2512820065021515,
+      "learning_rate": 0.0002927777777777778,
+      "loss": 3.8541,
+      "step": 1420
+    },
+    {
+      "epoch": 0.43402908597694223,
+      "grad_norm": 0.25888124108314514,
+      "learning_rate": 0.0002890740740740741,
+      "loss": 3.8421,
+      "step": 1440
+    },
+    {
+      "epoch": 0.44005726772662196,
+      "grad_norm": 0.29554685950279236,
+      "learning_rate": 0.0002853703703703704,
+      "loss": 3.8468,
+      "step": 1460
+    },
+    {
+      "epoch": 0.4460854494763017,
+      "grad_norm": 0.2608514130115509,
+      "learning_rate": 0.00028166666666666666,
+      "loss": 3.837,
+      "step": 1480
+    },
+    {
+      "epoch": 0.45211363122598147,
+      "grad_norm": 0.2781771123409271,
+      "learning_rate": 0.000277962962962963,
+      "loss": 3.8327,
+      "step": 1500
+    },
+    {
+      "epoch": 0.45211363122598147,
+      "eval_loss": 3.776173532413324,
+      "eval_perplexity": 43.6487014373908,
+      "eval_runtime": 112.7037,
+      "eval_samples_per_second": 14.72,
+      "eval_steps_per_second": 0.231,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4581418129756612,
+      "grad_norm": 0.27767959237098694,
+      "learning_rate": 0.00027425925925925925,
+      "loss": 3.8264,
+      "step": 1520
+    },
+    {
+      "epoch": 0.464169994725341,
+      "grad_norm": 0.24143943190574646,
+      "learning_rate": 0.00027055555555555557,
+      "loss": 3.8224,
+      "step": 1540
+    },
+    {
+      "epoch": 0.4701981764750207,
+      "grad_norm": 0.23281621932983398,
+      "learning_rate": 0.00026685185185185183,
+      "loss": 3.8114,
+      "step": 1560
+    },
+    {
+      "epoch": 0.4762263582247005,
+      "grad_norm": 0.23552870750427246,
+      "learning_rate": 0.00026314814814814815,
+      "loss": 3.8086,
+      "step": 1580
+    },
+    {
+      "epoch": 0.4822545399743802,
+      "grad_norm": 0.25271207094192505,
+      "learning_rate": 0.0002594444444444444,
+      "loss": 3.7954,
+      "step": 1600
+    },
+    {
+      "epoch": 0.48828272172405995,
+      "grad_norm": 0.2609969675540924,
+      "learning_rate": 0.0002557407407407408,
+      "loss": 3.8052,
+      "step": 1620
+    },
+    {
+      "epoch": 0.49431090347373974,
+      "grad_norm": 0.290229469537735,
+      "learning_rate": 0.00025203703703703705,
+      "loss": 3.7946,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5003390852234195,
+      "grad_norm": 0.2184811681509018,
+      "learning_rate": 0.0002483333333333333,
+      "loss": 3.7913,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5063672669730992,
+      "grad_norm": 0.24770255386829376,
+      "learning_rate": 0.00024462962962962963,
+      "loss": 3.7902,
+      "step": 1680
+    },
+    {
+      "epoch": 0.512395448722779,
+      "grad_norm": 0.2306816428899765,
+      "learning_rate": 0.00024092592592592593,
+      "loss": 3.7775,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5184236304724588,
+      "grad_norm": 0.2789316475391388,
+      "learning_rate": 0.00023722222222222222,
+      "loss": 3.7774,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5244518122221385,
+      "grad_norm": 0.2576800286769867,
+      "learning_rate": 0.0002335185185185185,
+      "loss": 3.7773,
+      "step": 1740
+    },
+    {
+      "epoch": 0.5304799939718182,
+      "grad_norm": 0.2184872031211853,
+      "learning_rate": 0.0002298148148148148,
+      "loss": 3.7739,
+      "step": 1760
+    },
+    {
+      "epoch": 0.536508175721498,
+      "grad_norm": 0.27034640312194824,
+      "learning_rate": 0.00022611111111111112,
+      "loss": 3.7632,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5425363574711778,
+      "grad_norm": 0.2661360800266266,
+      "learning_rate": 0.0002224074074074074,
+      "loss": 3.7668,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5425363574711778,
+      "eval_loss": 3.7104278931619175,
+      "eval_perplexity": 40.871291331988935,
+      "eval_runtime": 112.585,
+      "eval_samples_per_second": 14.736,
+      "eval_steps_per_second": 0.231,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5485645392208575,
+      "grad_norm": 0.2934229373931885,
+      "learning_rate": 0.0002187037037037037,
+      "loss": 3.7645,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5545927209705372,
+      "grad_norm": 0.24501343071460724,
+      "learning_rate": 0.000215,
+      "loss": 3.7559,
+      "step": 1840
+    },
+    {
+      "epoch": 0.560620902720217,
+      "grad_norm": 0.21381346881389618,
+      "learning_rate": 0.00021129629629629629,
+      "loss": 3.7482,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5666490844698968,
+      "grad_norm": 0.2350403219461441,
+      "learning_rate": 0.0002075925925925926,
+      "loss": 3.7488,
+      "step": 1880
+    },
+    {
+      "epoch": 0.5726772662195765,
+      "grad_norm": 0.23535749316215515,
+      "learning_rate": 0.0002038888888888889,
+      "loss": 3.7409,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5787054479692563,
+      "grad_norm": 0.22763009369373322,
+      "learning_rate": 0.0002001851851851852,
+      "loss": 3.7431,
+      "step": 1920
+    },
+    {
+      "epoch": 0.584733629718936,
+      "grad_norm": 0.22279077768325806,
+      "learning_rate": 0.00019648148148148148,
+      "loss": 3.7373,
+      "step": 1940
+    },
+    {
+      "epoch": 0.5907618114686157,
+      "grad_norm": 0.19868987798690796,
+      "learning_rate": 0.00019277777777777777,
+      "loss": 3.7385,
+      "step": 1960
+    },
+    {
+      "epoch": 0.5967899932182955,
+      "grad_norm": 0.22577470541000366,
+      "learning_rate": 0.00018907407407407406,
+      "loss": 3.7266,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6028181749679753,
+      "grad_norm": 0.23861481249332428,
+      "learning_rate": 0.00018537037037037038,
+      "loss": 3.7288,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6088463567176551,
+      "grad_norm": 0.2368811070919037,
+      "learning_rate": 0.00018166666666666667,
+      "loss": 3.7251,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6148745384673348,
+      "grad_norm": 0.23746469616889954,
+      "learning_rate": 0.00017796296296296296,
+      "loss": 3.7194,
+      "step": 2040
+    },
+    {
+      "epoch": 0.6209027202170145,
+      "grad_norm": 0.19934986531734467,
+      "learning_rate": 0.00017425925925925926,
+      "loss": 3.7203,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6269309019666943,
+      "grad_norm": 0.22287386655807495,
+      "learning_rate": 0.00017055555555555555,
+      "loss": 3.7182,
+      "step": 2080
+    },
+    {
+      "epoch": 0.632959083716374,
+      "grad_norm": 0.21625646948814392,
+      "learning_rate": 0.00016685185185185187,
+      "loss": 3.7133,
+      "step": 2100
+    },
+    {
+      "epoch": 0.632959083716374,
+      "eval_loss": 3.661569493497066,
+      "eval_perplexity": 38.922383385070816,
+      "eval_runtime": 112.8832,
+      "eval_samples_per_second": 14.697,
+      "eval_steps_per_second": 0.23,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6389872654660538,
+      "grad_norm": 0.2128862887620926,
+      "learning_rate": 0.00016314814814814816,
+      "loss": 3.7123,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6450154472157336,
+      "grad_norm": 0.22579094767570496,
+      "learning_rate": 0.00015944444444444445,
+      "loss": 3.7103,
+      "step": 2140
+    },
+    {
+      "epoch": 0.6510436289654133,
+      "grad_norm": 0.19634632766246796,
+      "learning_rate": 0.00015574074074074074,
+      "loss": 3.7077,
+      "step": 2160
+    },
+    {
+      "epoch": 0.657071810715093,
+      "grad_norm": 0.21508827805519104,
+      "learning_rate": 0.00015203703703703703,
+      "loss": 3.7071,
+      "step": 2180
+    },
+    {
+      "epoch": 0.6630999924647728,
+      "grad_norm": 0.23243261873722076,
+      "learning_rate": 0.00014833333333333335,
+      "loss": 3.7035,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6691281742144526,
+      "grad_norm": 0.22189666330814362,
+      "learning_rate": 0.00014462962962962964,
+      "loss": 3.7011,
+      "step": 2220
+    },
+    {
+      "epoch": 0.6751563559641324,
+      "grad_norm": 0.19845160841941833,
+      "learning_rate": 0.00014092592592592594,
+      "loss": 3.6996,
+      "step": 2240
+    },
+    {
+      "epoch": 0.681184537713812,
+      "grad_norm": 0.1785988211631775,
+      "learning_rate": 0.00013722222222222223,
+      "loss": 3.6866,
+      "step": 2260
+    },
+    {
+      "epoch": 0.6872127194634918,
+      "grad_norm": 0.2001648247241974,
+      "learning_rate": 0.00013351851851851852,
+      "loss": 3.6939,
+      "step": 2280
+    },
+    {
+      "epoch": 0.6932409012131716,
+      "grad_norm": 0.21330291032791138,
+      "learning_rate": 0.00012981481481481484,
+      "loss": 3.6868,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6992690829628513,
+      "grad_norm": 0.23209834098815918,
+      "learning_rate": 0.00012611111111111113,
+      "loss": 3.6803,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7052972647125311,
+      "grad_norm": 0.18792849779129028,
+      "learning_rate": 0.00012240740740740742,
+      "loss": 3.6824,
+      "step": 2340
+    },
+    {
+      "epoch": 0.7113254464622109,
+      "grad_norm": 0.19127227365970612,
+      "learning_rate": 0.0001187037037037037,
+      "loss": 3.6821,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7173536282118906,
+      "grad_norm": 0.21657773852348328,
+      "learning_rate": 0.000115,
+      "loss": 3.6727,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7233818099615703,
+      "grad_norm": 0.1759604662656784,
+      "learning_rate": 0.0001112962962962963,
+      "loss": 3.6809,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7233818099615703,
+      "eval_loss": 3.62554258938625,
+      "eval_perplexity": 37.54508920080861,
+      "eval_runtime": 112.4817,
+      "eval_samples_per_second": 14.749,
+      "eval_steps_per_second": 0.231,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7294099917112501,
+      "grad_norm": 0.20114636421203613,
+      "learning_rate": 0.00010759259259259259,
+      "loss": 3.6772,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7354381734609299,
+      "grad_norm": 0.20904237031936646,
+      "learning_rate": 0.00010388888888888889,
+      "loss": 3.6754,
+      "step": 2440
+    },
+    {
+      "epoch": 0.7414663552106096,
+      "grad_norm": 0.18807630240917206,
+      "learning_rate": 0.00010018518518518518,
+      "loss": 3.6756,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7474945369602893,
+      "grad_norm": 0.17234675586223602,
+      "learning_rate": 9.648148148148149e-05,
+      "loss": 3.6686,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7535227187099691,
+      "grad_norm": 0.18095088005065918,
+      "learning_rate": 9.277777777777778e-05,
+      "loss": 3.6691,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7595509004596489,
+      "grad_norm": 0.19775047898292542,
+      "learning_rate": 8.907407407407407e-05,
+      "loss": 3.6679,
+      "step": 2520
+    },
+    {
+      "epoch": 0.7655790822093286,
+      "grad_norm": 0.1963493674993515,
+      "learning_rate": 8.537037037037038e-05,
+      "loss": 3.668,
+      "step": 2540
+    },
+    {
+      "epoch": 0.7716072639590084,
+      "grad_norm": 0.16776752471923828,
+      "learning_rate": 8.166666666666667e-05,
+      "loss": 3.663,
+      "step": 2560
+    },
+    {
+      "epoch": 0.7776354457086881,
+      "grad_norm": 0.18634021282196045,
+      "learning_rate": 7.796296296296296e-05,
+      "loss": 3.6616,
+      "step": 2580
+    },
+    {
+      "epoch": 0.7836636274583678,
+      "grad_norm": 0.1853896975517273,
+      "learning_rate": 7.425925925925927e-05,
+      "loss": 3.6554,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7896918092080476,
+      "grad_norm": 0.19131463766098022,
+      "learning_rate": 7.055555555555556e-05,
+      "loss": 3.6617,
+      "step": 2620
+    },
+    {
+      "epoch": 0.7957199909577274,
+      "grad_norm": 0.1861460953950882,
+      "learning_rate": 6.685185185185186e-05,
+      "loss": 3.6589,
+      "step": 2640
+    },
+    {
+      "epoch": 0.8017481727074072,
+      "grad_norm": 0.18328429758548737,
+      "learning_rate": 6.314814814814815e-05,
+      "loss": 3.6581,
+      "step": 2660
+    },
+    {
+      "epoch": 0.8077763544570868,
+      "grad_norm": 0.1829613447189331,
+      "learning_rate": 5.9444444444444445e-05,
+      "loss": 3.6508,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8138045362067666,
+      "grad_norm": 0.17789465188980103,
+      "learning_rate": 5.5740740740740744e-05,
+      "loss": 3.6536,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8138045362067666,
+      "eval_loss": 3.600645640400387,
+      "eval_perplexity": 36.62187137207453,
+      "eval_runtime": 112.2753,
+      "eval_samples_per_second": 14.776,
+      "eval_steps_per_second": 0.232,
+      "step": 2700
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 3000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 300,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.4440968002469888e+18,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-2700/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08d76f8b5b85f05fb7caa15e408143d3ba0686e42d70879850259702897d08d2
+size 5905

checkpoint-2700/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-3000/config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "architectures": [
+    "SharedSpaceDecoderForCausalLM"
+  ],
+  "attention_backend": "flash_attention_2",
+  "attention_bias": false,
+  "attention_dropout_prob": 0.1,
+  "bos_token_id": 50256,
+  "classifier_dropout": null,
+  "dtype": "float32",
+  "eos_token_id": 50256,
+  "ffn_decompose": false,
+  "ffn_rank": null,
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "kv_shared_dim": null,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 1024,
+  "model_type": "shared_subspace_decoder",
+  "nope_dims": 32,
+  "norm_type": "rmsnorm",
+  "num_attention_heads": 12,
+  "num_dense_layers": 0,
+  "num_hidden_layers": 12,
+  "o_shared_dim": null,
+  "pad_token_id": 50256,
+  "q_shared_dim": null,
+  "qk_private_dim": 64,
+  "rms_norm_eps": 1e-06,
+  "rope_dims": 32,
+  "rope_scaling": {
+    "factor": 2.0,
+    "type": "linear"
+  },
+  "rope_theta": 10000.0,
+  "transformers_version": "4.56.0",
+  "vo_private_dim": 64,
+  "vocab_rank": null,
+  "vocab_size": 50257,
+  "vocab_subspace": false
+}

checkpoint-3000/feedforward.py ADDED Viewed

	@@ -0,0 +1,196 @@

+"""# ▂▂▂▂▂▂▂▂▂▂▂▂
+# `feedforward.py`
+Regarding dropout:
+- I don't see it applied to the MoE in DeepSeek-V3, [here](https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py).
+- I don't see it applied in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L140)
+Norms:
+* nn.RMSNorm [here](https://docs.pytorch.org/docs/stable/generated/torch.nn.RMSNorm.html)
+## FFN
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .shared_space_config import SharedSpaceDecoderConfig
def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
    """
    Build the normalization module selected by ``config.norm_type``.

    Args:
        hidden_size: Size of the dimension handed to the norm constructor.
        config: Configuration supplying ``norm_type`` plus the epsilon that
            goes with it (``layer_norm_eps`` for "layernorm",
            ``rms_norm_eps`` for "rmsnorm").

    Returns:
        An ``nn.LayerNorm`` when ``norm_type`` is "layernorm", or a
        ``DeepseekV3RMSNorm`` when it is "rmsnorm".

    Raises:
        ValueError: If ``norm_type`` is any other value.
    """
    norm_type = config.norm_type

    if norm_type == "rmsnorm":
        return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)

    if norm_type == "layernorm":
        return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)

    # Defensive: config validation is expected to reject other values earlier.
    raise ValueError(f"Unknown norm_type: {norm_type}")
+# TODO - Find a shared place to put this.
class DeepseekV3RMSNorm(nn.Module):
    """
    Root-mean-square normalization with a learned per-feature scale.

    DeepseekV3RMSNorm is equivalent to T5LayerNorm.

    Args:
        hidden_size: Length of the learned ``weight`` vector (initialized to ones).
        eps: Constant added to the mean square before the reciprocal square root.
    """

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.variance_epsilon = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, hidden_states):
        """Normalize ``hidden_states`` over its last dimension and apply ``weight``."""
        # Remember the caller's dtype; the statistics below are computed in float32.
        orig_dtype = hidden_states.dtype
        upcast = hidden_states.to(torch.float32)

        # Mean of squares over the last dimension, kept broadcastable.
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)

        # Cast back to the input dtype before applying the learned scale.
        return self.weight * normalized.to(orig_dtype)
+class SubspaceFeedForward(nn.Module):
+    """
+    Feed-forward block for SharedSpaceDecoder.
+    Implements SwiGLU:
+        FFN(x) = W_out( Swish(W_in(x)) ⊙ W_gate(x) ) + residual
+    Supports both dense and decomposed MLP variants.
+    Dense:
+        - W_in:   Linear(hidden_dim → intermediate_dim)
+        - W_gate: Linear(hidden_dim → intermediate_dim)
+        - W_out:  Linear(intermediate_dim → hidden_dim)
+    Decomposed:
+        - W_in_shared:   Linear(hidden_dim → rank, bias=False)
+        - W_in_shared_norm: RMSNorm
+        - W_in:          Linear(rank → intermediate_dim)
+        - W_gate_shared: Linear(hidden_dim → rank, bias=False)
+        - W_gate_shared_norm: RMSNorm
+        - W_gate:        Linear(rank → intermediate_dim)
+        - W_out:         Linear(intermediate_dim → rank, bias=False)
+        - W_out_shared:  Linear(rank → hidden_dim)
+    Residual, dropout, and post-norm are handled inside the block.
+    """
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        #dropout_prob = config.hidden_dropout_prob # TODO - Style -- don't define variables if only used once.
+        # Determine whether this is a dense or decomposed layer.
+        # It's dense if either:
+        #  - ffn_decompose is disabled (no dense layers at all)
+        #  - ffn_decompose is enabled, but this is one of the early dense layers.
+        self.is_dense = (not config.ffn_decompose) or (layer_idx < config.num_dense_layers)
+        hidden_dim = config.hidden_size
+        intermediate_dim = config.intermediate_size # TODO - Find something shorter, and use the same name.
+        # If it's one of the dense layers,
+        if self.is_dense:
+            # === Dense FFN Projections ===
+            self.W_in = nn.Linear(hidden_dim, intermediate_dim)
+            self.W_gate = nn.Linear(hidden_dim, intermediate_dim)
+            self.W_out = nn.Linear(intermediate_dim, hidden_dim)
+        # Define weights for the decomposed version.
+        else:
+            rank = config.ffn_rank
+            print("hidden_dim:", hidden_dim)
+            print("rank:", rank)
+            # === Input Projections ===
+            self.W_in_shared = nn.Linear(hidden_dim, rank, bias=False)
+            self.W_in_shared_norm = create_norm_layer(rank, config)
+            self.W_in = nn.Linear(rank, intermediate_dim, bias=True)
+            # === Gate Projections ===
+            self.W_gate_shared = nn.Linear(hidden_dim, rank, bias=False)
+            self.W_gate_shared_norm = create_norm_layer(rank, config)
+            self.W_gate = nn.Linear(rank, intermediate_dim, bias=True)
+            # === Output Projection ===
+            self.W_out = nn.Linear(intermediate_dim, rank, bias=False)
+            # TODO - Could experiment with this.
+            #self.W_out_shared_layernorm = DeepseekV3RMSNorm(rank, eps=config.eps)
+            self.W_out_shared = nn.Linear(rank, hidden_dim, bias=True)
+        # See notes no dropout
+        #self.dropout = nn.Dropout(config.hidden_dropout_prob)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # === Tensor Dimension Symbols ===
+        # B: batch_size     — number of samples in the batch
+        # T: seq_len        — number of tokens per sample
+        # D: hidden_dim     — model embedding size
+        # R: ffn_rank       — latent shared subspace dimension
+        # D_ff: intermediate_size — FFN hidden dimension
+        # =========================
+        #    Gated Feedforward
+        # =========================
+        if self.is_dense:
+            # =============
+            #     Dense
+            # =============
+            # Input:  x [B, T, D]
+            # Output: x_proj [B, T, D_ff]
+            x_proj = self.W_in(x)
+            # Output: gate [B, T, D_ff]
+            gate = self.W_gate(x)
+            # SwiGLU nonlinearity
+            x = F.silu(x_proj) * gate  # [B, T, D_ff]
+            # See notes on dropout
+            #x = self.dropout(x)
+            # Output: x [B, T, D]
+            x = self.W_out(x)
+        else:
+            # ==================
+            #     Decomposed
+            # ==================
+            # Input:  x [B, T, D]
+            # Output: x_proj [B, T, D_ff]
+            x_proj = self.W_in(self.W_in_shared_norm(self.W_in_shared(x)))
+            # Input:  x [B, T, D]
+            # Output: gate [B, T, D_ff]
+            gate = self.W_gate(self.W_gate_shared_norm(self.W_gate_shared(x)))
+            # SwiGLU nonlinearity
+            x = F.silu(x_proj) * gate  # [B, T, D_ff]
+            # See notes on dropout
+            #x = self.dropout(x)
+            # Output: x [B, T, D]
+            x = self.W_out_shared(self.W_out(x))
+        return x

checkpoint-3000/gla.py ADDED Viewed

	@@ -0,0 +1,721 @@

+"""# ▂▂▂▂▂▂▂▂▂▂▂▂
+# `gla.py`
+Based on: https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional
+import math
+from .shared_space_config import SharedSpaceDecoderConfig
+def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
+    """
+    Create a normalization layer based on the config norm_type.
+    If `hidden_size` is `None`, this returns an identity layer.
+    Args:
+        hidden_size: The dimension to normalize over
+        config: Configuration containing norm_type and epsilon values
+    Returns:
+        Either a LayerNorm or RMSNorm layer
+    """
+    if hidden_size is None:
+        return nn.Identity()
+    elif config.norm_type == "layernorm":
+        return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+    elif config.norm_type == "rmsnorm":
+        return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
+    else:
+        # This should be caught by config validation, but being defensive
+        raise ValueError(f"Unknown norm_type: {config.norm_type}")
+# TODO - Find a shared place to put this.
+class DeepseekV3RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        DeepseekV3RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+# Helper function needed because it's called twice during RoPE,
+# but I dumped it in the comments there.
+# TODO - Nah, screw it, just write it twice! At least then you get
+# to use the word 'query' instead of 'x'.
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+class RotaryEmbedding(nn.Module):
+    """Precompute RoPE embeddings and store them as buffers."""
+    def __init__(self, config: SharedSpaceDecoderConfig) -> None:
+        super().__init__()
+        dim = config.rope_dims
+        seq_len = config.max_position_embeddings
+        # ------------------------------
+        # Compute inverse frequencies
+        # ------------------------------
+        # Shape: [dim // 2]
+        #   inv_freq[i] = 1 / (theta^(i / dim))
+        inv_freq = 1.0 / (
+            config.rope_theta
+            ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
+        )
+        # ------------------------------
+        # Apply RoPE scaling if configured
+        # ------------------------------
+        if config.rope_scaling is not None:
+            scaling_type = config.rope_scaling.get("type", "linear")
+            scaling_factor = config.rope_scaling.get("factor", 1.0)
+            if scaling_type == "linear":
+                # Linear scaling: divide frequencies by scaling factor
+                inv_freq = inv_freq / scaling_factor
+            elif scaling_type == "dynamic":
+                # Dynamic scaling: adjust based on sequence length
+                # This is a simplified implementation
+                inv_freq = inv_freq / scaling_factor
+            else:
+                print(f"Warning: Unknown RoPE scaling type '{scaling_type}', using linear scaling")
+                inv_freq = inv_freq / scaling_factor
+        # ------------------------------
+        # Compute position indices
+        # ------------------------------
+        # Shape: [seq_len]
+        t = torch.arange(seq_len, dtype=torch.float32)
+        # ------------------------------
+        # Outer product: [seq_len, dim // 2]
+        # Each row i contains: t[i] * inv_freq
+        # ------------------------------
+        freqs = torch.outer(t, inv_freq)
+        # ------------------------------
+        # Duplicate for interleaved sin/cos: [seq_len, dim]
+        # This matches the common format: [sin_0, cos_0, sin_1, cos_1, ...]
+        # ------------------------------
+        emb = torch.cat((freqs, freqs), dim=-1)
+        # ------------------------------
+        # Register cos/sin as buffers
+        # - Stored in float32
+        # - Will be moved to correct device/dtype via model.to(...)
+        # - Not saved with state_dict (persistent=False)
+        # ------------------------------
+        self.register_buffer("cos", emb.cos(), persistent=False)
+        self.register_buffer("sin", emb.sin(), persistent=False)
+    def forward(self, position_ids: torch.LongTensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """ """
+        return None # This function is not necessary.
+"""## GLA"""
+class GroupedLatentAttention(nn.Module):
+    """
+    This version of Multihead Latent Attention applies the re-ordering trick from DeepSeekV3.
+    Instead of comparing the queries and keys in the query-key space, we compare them in the
+    kv-shared space.
+    For clarity, I've re-interpreted the naming of the heads, and am framing it as MQA.
+    What were previously labeled the query and key heads are now treated as a low-rank decomposition
+    of the query heads.
+    What we considered the "shared key/value space" is now a single key head that is also used as the
+    value head.
+    Finally, what we previously labeled the value and output heads are now treated as a low-rank
+    decomposition of the output heads.
+    This interpretation / implementation is designed to leverage the performance benefits of GQA.
+    The trade-off is that the query-key matching space is now larger--it will require a greater
+    number of calculations to match the queries to the keys. The hope is that the memory bandwidth
+    savings will outweigh the increased computational cost.
+    The same applies to the value-output space.
+    Note that, although the query-key and value-output spaces are now large, the low-rank
+    decomposition of the query heads and output heads ensures that the heads are still effectively
+    low rank / not over-parameterized.
+    Finally, note that this implementation also supports the optional use of shared spaces on
+    the query and output sides.
+    I've named the class "GroupedLatentAttention" because I may expand it to support multiple
+    key/value heads (i.e., multiple groups of query heads) in the future.
+    ==== Adding RoPE to VO ====
+    ### **Attempt**
+    We're extending Rotary Position Embeddings (RoPE) beyond the query-key interaction to the **value-output path** in Multihead Latent Attention (MLA).
+    * In DeepSeek-V3's MLA framing, the same **full-rank key/value head** provides both the keys (for patterns) and the values (for messages).
+    * Queries and output heads are low-rank bottlenecks, effectively serving as vocabularies of **pattern directions** (Q) and **message directions** (O).
+    * Standard RoPE only modulates the Q–K dot product. Our attempt is to also apply RoPE phases consistently in the V–O pathway, so that **positional dependence is preserved in both the matching (QK) and messaging (VO) sides**.
+    --
+    ### **Hypothesis**
+    If we rotate value vectors by their **source position phase** and then apply the **inverse rotation at the destination** before output projection, the model gains a clean **relative-position equivariance** in the message path, mirroring the property RoPE provides for queries and keys.
+    This should:
+    1. Make the 1-to-1 correspondence between "pattern templates" (Q) and "message templates" (O) more consistent.
+    2. Reduce the burden on output heads to learn ad-hoc positional compensation.
+    3. Improve long-context generalization, since both attention matching *and* message passing would share the same relative-position geometry.
+    """
+    def __init__(self, config: SharedSpaceDecoderConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        # Used to determine if this layer is dense or uses latents.
+        self.layer_idx = layer_idx
+        self.attention_dropout_prob = config.attention_dropout_prob
+        self.num_heads = config.num_attention_heads
+        self.rope_theta = config.rope_theta
+        self.rope_dims = config.rope_dims
+        self.nope_dims = config.nope_dims
+        self.q_shared_dim = config.q_shared_dim
+        # What was previously considered the key/value shared dimension is now the
+        # size of the MQA style single key/value head.
+        self.kv_head_dim = config.kv_shared_dim
+        self.o_shared_dim = config.o_shared_dim
+        # What was previously the query/key head size is now the size of
+        # the query head decomposition.
+        self.q_inner_dim = config.qk_private_dim
+        # What was previously the value/output head size is now the size of
+        # the output head decomposition.
+        self.o_inner_dim = config.vo_private_dim
+        self.hidden_size = config.hidden_size
+        # =========================
+        #     Input Projections
+        # =========================
+        # If this is one of the dense layers,
+        if self.layer_idx < config.num_dense_layers:
+            # =========================
+            #     Dense Attention
+            # =========================
+            # No latent projections.
+            self.latent_spaces = False
+            # Define the standard QKV projection
+            self.qkv_proj = nn.Linear(
+                config.hidden_size,
+                self.num_heads * (self.qk_private_dim * 2 + self.vo_private_dim),
+                bias=config.attention_bias,
+            )
+            # Dense output projection
+            self.o_proj = nn.Linear(
+                self.num_heads * self.vo_private_dim,
+                config.hidden_size,
+                bias=config.attention_bias,
+            )
+        # If we're past the dense layers,
+        else:
+            # =========================
+            #     Latent Attention
+            # =========================
+            # Use latent projections.
+            self.latent_spaces = True
+            # Input latent projections
+            print("config.q_shared_dim", config.q_shared_dim)
+            # ==========================
+            #     Shared Query Space
+            # ==========================
+            # If we're using a shared query subspace,
+            if config.q_shared_dim is not None:
+                # Set a flag that we'll check in `forward`.
+                self.query_shared = True
+                self.q_shared_proj = nn.Linear(
+                    config.hidden_size,
+                    self.q_shared_dim,
+                    bias=config.attention_bias,
+                )
+                self.q_shared_norm = create_norm_layer(self.q_shared_dim, config)
+            else:
+                print("Using identity for shared projection.")
+                # Set a flag that we'll check in `forward`.
+                self.query_shared = False
+                self.q_shared_dim = config.hidden_size
+                #print("Updated self.q_shared_dim to", self.q_shared_dim)
+                # Use identity.
+                self.q_shared_proj = nn.Identity()
+                self.q_shared_norm = nn.Identity()
+            # ==========================
+            #     Shared Output Space
+            # ==========================
+            # If we're using a shared output space,
+            if config.o_shared_dim is not None:
+                # Set a flag that we'll check in `forward`.
+                self.output_shared = True
+                # Shared output projection
+                # The head outputs from `o_private_proj` are first summed together (across
+                # heads) in the latent space.
+                # Then we project their combined outputs (a single vector per token)
+                # back to model space via `o_shared_proj`.
+                self.o_shared_proj = nn.Linear(
+                    self.o_shared_dim,
+                    self.hidden_size,
+                    bias=config.attention_bias
+                )
+                self.o_shared_norm = create_norm_layer(self.o_shared_dim, config)
+            else:
+                # Set a flag that we'll check in `forward`.
+                self.output_shared = False
+                self.o_shared_dim = config.hidden_size
+                # Use identity.
+                self.o_shared_proj = nn.Identity()
+                self.o_shared_norm = nn.Identity()
+            # ================================
+            #      Decomposed Query Heads
+            # ================================
+            # Query down projections.
+            # The query head inner dimension makes the head low rank, as usual.
+            self.q_priv_a_proj = nn.Linear(
+                self.q_shared_dim,
+                self.num_heads * self.q_inner_dim,
+                bias=False
+            )
+            # Query up projections.
+            # We project back to the larger key/value space.
+            # Rather than create a linear and break it apart, we can create our
+            # desired shapes.
+            #  per-head Dq_c -> Dkv     (store as [H, Dq_c, Dkv])
+            self.q_priv_b_weight = nn.Parameter(
+                torch.empty(self.num_heads, self.q_inner_dim, self.kv_head_dim)
+            )
+            nn.init.kaiming_uniform_(self.q_priv_b_weight, a=math.sqrt(5))
+            # ====================================
+            #      Single Joint Key/Value Head
+            # ====================================
+            # The single joint key/value head.
+            self.kv_priv_proj = nn.Linear(
+                self.hidden_size,
+                self.kv_head_dim,
+                bias=False,
+            )
+            self.kv_priv_norm = create_norm_layer(self.kv_head_dim, config)
+            # ================================
+            #      Decomposed Output Heads
+            # ================================
+            # Down: values [B,H,T,Dkv] -> per-head Do_c using weights [H, Dkv, Do_c]
+            self.o_priv_a_weight = nn.Parameter(
+                torch.empty(self.num_heads, self.kv_head_dim, self.o_inner_dim)
+            )
+            nn.init.kaiming_uniform_(self.o_priv_a_weight, a=math.sqrt(5))
+            # Output up projections.
+            # We project back to the larger output subspace (or the model space,
+            # if no subspace is used).
+            self.o_priv_b_proj = nn.Linear(
+                self.num_heads * self.o_inner_dim,
+                self.o_shared_dim,
+                bias=False
+            )
+        # Let SDPA choose 1/sqrt(E). If you want explicit: self.kv_head_dim ** -0.5
+        self.softmax_scale = None
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        #past_key_value: Optional[Cache] = None, # TODO - Can I remove this?
+        #cache_position: Optional[torch.LongTensor] = None, # TODO - Can I remove this?
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        # === Tensor Dimension Symbols ===
+        #     B: batch_size     — number of samples in the batch
+        #     T: seq_len        — number of tokens per sample
+        #     H: n_heads        — number of attention heads
+        #     D: hidden_dim     — model embedding size
+        #  Dq_c: q_inner_dim    - per-head decomposition dim for Q
+        Dq_c = self.q_inner_dim        # per-head inner dim for Q
+        #  Do_c: o_inner_dim    - per-head decomposition dim for O
+        Do_c = self.o_inner_dim        # per-head inner dim for O
+        #   Dkv: kv_head_dim    - Head size of the joint key/value head
+        Dkv  = self.kv_head_dim        # Head size of the joint key/value head
+        #    Dr: rope_dims      - The first Dr dimensions receive rope.
+        #  Dq_s: q_shared_dim   - query shared subspace size
+        Dq_s = self.q_shared_dim
+        #  Do_s: o_shared_dim   - output shared subspace size
+        Do_s = self.o_shared_dim
+        # Input token embeddings
+        # hidden_states: [B, T, D]
+        B, T = hidden_states.shape[:2]
+        H = self.num_heads
+        # =============================
+        #     Shared Query Space
+        # =============================
+        # These are set to identity if no shared query space is used.
+        # Project token embeddings into shared latents
+        # Input:
+        #     hidden_states [B, T, D]
+        #     q_shared_proj [D, Dq_s]
+        #    kv_shared_proj [D, Dkv]
+        # Output:
+        #          q_shared  [B, T, Dq_s]
+        #          kv_shared [B, T, Dkv]
+        q_shared = self.q_shared_proj(hidden_states)
+        # Normalize latent vectors, shapes unchanged.
+        q_shared = self.q_shared_norm(q_shared)
+        # ================================
+        #     Decomposed Query Heads
+        # ================================
+        # Project query latents onto decomposed query heads.
+        #
+        # Down projection ('a')
+        # Input:
+        #     q_shared       [B, T, Dq_s]
+        #     q_priv_a_proj [Dq_s, H*Dq_c]
+        # Output:
+        #     queries_c   [B, T, H*Dq_c]
+        queries_c = self.q_priv_a_proj(q_shared)
+        # Split the vectors by head
+        # Input:
+        #     queries_c        [B, T, H*Dq_c]
+        # Output:
+        #     queries_c   [B, T, H, Dq_c]
+        queries_c = queries_c.view(B, T, H, Dq_c)
+        # Up projection ('b')
+        # Input:
+        #     queries_c        [B, T, H, Dq_c]
+        #     q_priv_b_weight        [H, Dq_c, Dkv]
+        # Output:
+        #     queries     [B, H, T, Dkv]
+        queries = torch.einsum("bthd,hdc->bhtc", queries_c, self.q_priv_b_weight)
+        # ===================================
+        #     Single Joint Key/Value Head
+        # ===================================
+        # Project token embeddings into single joint key/value head.
+        # Input:
+        #     hidden_states [B, T, D]
+        #     kv_priv_proj [D, Dkv]
+        # Output:
+        #     keyvalue [B, T, Dkv]
+        keyvalue = self.kv_priv_proj(hidden_states)
+        # Apply QK normalization.
+        keyvalue = self.kv_priv_norm(keyvalue)
+        # Prepare the queries and keyvalue vectors for RoPE and flash attention.
+        # We have multiple query heads, and the queries are in `queries`.
+        # We have a single key head, and the keyvector is in `keyvalue`.
+        # Move the head dimension to the front, so for each head, we have
+        # a series of vectors for each token in the sequence.
+        #
+        # Inputs:
+        #   keyvalue  [B, T, Dkv]
+        # Output:
+        #   keyvalue   [B, 1, T, Dkv]
+        keyvalue = keyvalue.unsqueeze(1)
+        # ==================
+        #        RoPE
+        # ==================
+        # Apply rotary position embeddings to the first `self.rope_dims` of
+        # each head.
+        # The slice operations are free, but the concatenation is
+        # not, because the outputs of the rotation operation are new data
+        # occupying different memory. Still considered the best option,
+        # though.
+        # 1. Unpack the precomputed cosine and sine embeddings
+        # Position embeddings is a tuple of
+        #    (cos [seq_len, rope_dims],
+        #     sin [seq_len, rope_dims])
+        cos, sin = position_embeddings
+        # 2. Split the query and key heads into the part to rotate and the part
+        #    to pass through (early columns get position info, later ones don't)
+        #
+        #  (Using queries as example)
+        #  Inputs:
+        #    queries  [B, H, T, Dkv]  Dkv = rope_dims + not_rope_dims
+        #  Outputs:
+        #    q_rope   [B, H, T, Dr]
+        #    q_pass   [B, H, T, Dkv-Dr]
+        q_rope, q_pass = queries[..., :self.rope_dims], queries[..., self.rope_dims:]
+        k_rope, k_pass =   keyvalue[..., :self.rope_dims],   keyvalue[..., self.rope_dims:]
+        # 3. Apply the rotary embedding to the designated slice
+        #
+        # To broadcast cos and sin across the batch and head dimensions, we unsqueeze them.
+        # Shape change: [T, Dr] -> [1, 1, T, Dr]
+        cos = cos.unsqueeze(0).unsqueeze(0)
+        sin = sin.unsqueeze(0).unsqueeze(0)
+        #print("q_rope.shape[-1] // 2:", (q_rope.shape[-1] // 2))
+        #print("x1 = x[..., :x.shape[-1] // 2 ].shape:", q_rope[..., :q_rope.shape[-1] // 2 ].shape)
+        #print("sin/cos.shape:", cos.shape)
+        #print("q_rope.shape:", q_rope.shape)
+        #print("(q_rope * cos).shape:", (q_rope * cos).shape)
+        #print("rotate_half(q_rope).shape:", rotate_half(q_rope).shape)
+        #print("(rotate_half(q_rope) * sin).shape:", (rotate_half(q_rope) * sin).shape)
+        """
+        In this example   batch_size = 2, hum_heads = 8, seq_len = 65, rope_dims = 16
+                        q_rope.shape[-1] // 2: 8
+        x1 = x[..., :x.shape[-1] // 2 ].shape: torch.Size([2, 8, 65, 8])
+                    sin/cos.shape: torch.Size([1, 1, 65, 16])  # After double unsqueeze.
+                    vq_rope.shape: torch.Size([2, 8, 65, 16])
+             (q_rope * cos).shape: torch.Size([2, 8, 65, 16])
+        rotate_half(q_rope).shape: torch.Size([2, 8, 65, 16])
+        (rotate_half(q_rope) * sin).shape: torch.Size([2, 8, 65, 16])
+        """
+        # Let's walk through the queries as the example.
+        # What does rotate half do?
+        #    dim -1 is the row vectors, the queries
+        #
+        #  Step 1: Split the vector in half.
+        #    "q_rope.shape[-1] // 2" <- How much to select. Half the length of the q_rope vector
+        #    x1 = x[..., :x.shape[-1] // 2 ]  # Select the first half of the vector.
+        #    x2 = x[...,  x.shape[-1] // 2:]  # Select the second half.
+        #
+        #  Step 2:
+        #      - Apply negative to the values in the second half.
+        #      - Reverse the order of the halves.
+        #    return torch.cat((-x2, x1), dim=-1)
+        #
+        # ---- (q_rope * cos) ----
+        # Element-wise multiply the values in each `cos` vector with the
+        # corresponding (i.e., same sequence position) `q_rope` vector.
+        #
+        # Inputs:
+        #    q_rope  [B, H, T, Dr]
+        #       cos  [1, 1, T, Dr]
+        #
+        # Outputs:
+        #        x   [B, H, T, Dr]
+        #
+        # ---- (rotate_half(q_rope)) ----
+        #  TODO
+        #
+        # Inputs:
+        #       q_rope    [B, T, Dr]
+        #
+        # Outputs:
+        #   rot_q_rope    [B, T, Dr]
+        #
+        # ---- rotated * sin ----
+        #  TODO
+        q_rotated = (q_rope * cos) + (rotate_half(q_rope) * sin)
+        k_rotated = (k_rope * cos) + (rotate_half(k_rope) * sin)
+        # 4. Concatenate the rotated and pass-through parts back together
+        # Input (each): [B, H, T, Dr] and [B, H, T, Dkv-Dr]
+        # Output (each): [B, H, T, Dkv]
+        # (Where h = 1 for the key head and h = num_heads for the query heads)
+        queries = torch.cat((q_rotated, q_pass), dim=-1)
+        keyvalue = torch.cat((k_rotated, k_pass), dim=-1)
+        # ====================
+        #      GQA / MQA
+        # ====================
+        # GPT says that flash attention will infer the broadcasting, so `expand` is not needed.
+        #
+        # We need to use the `expand` operation to broadcast the keyvalue vector
+        # across the query heads.
+        # Input:
+        #     keyvalue [B, 1, T, Dkv]
+        # Output:
+        #     keyvalue [B, H, T, Dkv]
+        #keyvalue = keyvalue.expand(-1, H, -1, -1)
+        # ===================
+        #       Attention
+        # ===================
+        # We're ready for the attention score calculation.
+        # Only apply dropout during training.
+        # self.training is a pytorch flag.
+        if self.training:
+            dropout_p = self.attention_dropout_prob
+        else:
+            dropout_p = 0.0
+        # Call SDPA / Flash Attention
+        # https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+        # Apply MQA / GQA. In this case, we have a single key head, and multiple query heads.
+        values = F.scaled_dot_product_attention(
+            queries,
+            keyvalue, # Single key vector (joint with value) for GQA / MQA.
+            keyvalue, # Single value vector (joint with key) for GQA / MQA.
+            attn_mask=None, # attention_mask,
+            dropout_p=dropout_p,
+            scale=self.softmax_scale,
+            is_causal=True, # This is a decoder - apply causal masking
+        )
+        # Attention outputs:
+        # values [B, H, T, Dkv]
+        # The final Dr dims of the value vectors carry RoPE information.
+        # We can either (1) add position dependence to the value-output process,
+        # or (2) we can strip off the RoPE information and only use the non-RoPE parts.
+        # Let's try option 1!
+        # Split the values into the RoPE and non-RoPE parts.
+        # Input:
+        #     values [B, H, T, Dkv]
+        # Output:
+        #     values_rope [B, H, T, Dr]
+        #     values_pass [B, H, T, Dkv-Dr]
+        values_rope, values_pass = values[..., :self.rope_dims], values[..., self.rope_dims:]
+        # Fold the query RoPE information into the value vectors.
+        # Inverse rotation: R_{-θ} x  =  (x * cos)  - (rotate_half(x) * sin)
+        # Input:
+        #     values_rope [B, H, T, Dr]
+        #            cos  [1, 1, T, Dr]
+        #            sin  [1, 1, T, Dr]
+        # Output:
+        #     values_unrot [B, H, T, Dr]
+        values_unrot = (values_rope * cos) - (rotate_half(values_rope) * sin)
+        # Now the values have the offset information in their rope dimensions,
+        # and the output heads can learn to use it.
+        values = torch.cat((values_unrot, values_pass), dim=-1)  # [B,H,T,Dkv]
+        # =========================
+        #     Output Projection
+        # =========================
+        # Project the values onto the decomposed output heads.
+        # Output down projection heads.
+        # Input:
+        #            values  [B, H, T, Dkv]
+        #   o_priv_a_weight     [H, Dkv, Do_c]
+        # Output:
+        #         outputs_c  [B, H, T, Do_c]
+        outputs_c = torch.einsum("bhtd,hdc->bhtc", values, self.o_priv_a_weight)
+        # For the up projection, we can concatenate the 'outputs_c' vectors by head,
+        # (in the same way we would usually concatenate the value vectors)
+        # Input:
+        #    outputs_c  [B, H, T, Do_c]
+        # Output:
+        #   outputs_c  [B, T, H*Do_c]
+        outputs_c = outputs_c.permute(0, 2, 1, 3).contiguous().view(B, T, H * Do_c)
+        # Project up to the shared output space and sum across the output heads.
+        # Input:
+        #    outputs_c  [B, T, H*Do_c]
+        #    o_priv_b_proj [H*Do_c, Do_s]
+        # Output:
+        #    output_s  [B, T, Do_s]
+        output_s = self.o_priv_b_proj(outputs_c)
+        # Apply normalization to the output latents
+        output_s = self.o_shared_norm(output_s)
+        # Re-project the output latent representation back to model space.
+        # Input:
+        #    output_s      [B, T, Do_s]
+        #    o_shared_proj [Do_s, D]
+        # Output:
+        #    attn_output   [B, T, D]
+        attn_output = self.o_shared_proj(output_s)
+        # TODO - Not currently supported.
+        # If this is a dense layer,
+        # Project the values back into model space.
+        # attn_output = self.o_proj(attn_output)
+        # -----------------------------------------
+        return attn_output

checkpoint-3000/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-3000/mla.py ADDED Viewed

	@@ -0,0 +1,619 @@

+"""# ▂▂▂▂▂▂▂▂▂▂▂▂
+# `mla.py`
+Based on: https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py
+## RotaryEmbedding
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional
+from .shared_space_config import SharedSpaceDecoderConfig
+def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
+    """
+    Create a normalization layer based on the config norm_type.
+    If `hidden_size` is `None`, this returns an identity layer.
+    Args:
+        hidden_size: The dimension to normalize over
+        config: Configuration containing norm_type and epsilon values
+    Returns:
+        Either a LayerNorm or RMSNorm layer
+    """
+    if hidden_size is None:
+        return nn.Identity()
+    elif config.norm_type == "layernorm":
+        return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+    elif config.norm_type == "rmsnorm":
+        return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
+    else:
+        # This should be caught by config validation, but being defensive
+        raise ValueError(f"Unknown norm_type: {config.norm_type}")
+# TODO - Find a shared place to put this.
+class DeepseekV3RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        DeepseekV3RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+# Helper function needed because it's called twice during RoPE,
+# but I dumped it in the comments there.
+# TODO - Nah, screw it, just write it twice! At least then you get
+# to use the word 'query' instead of 'x'.
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+class RotaryEmbedding(nn.Module):
+    """Precompute RoPE embeddings and store them as buffers."""
+    def __init__(self, config: SharedSpaceDecoderConfig) -> None:
+        super().__init__()
+        dim = config.rope_dims
+        seq_len = config.max_position_embeddings
+        # ------------------------------
+        # Compute inverse frequencies
+        # ------------------------------
+        # Shape: [dim // 2]
+        #   inv_freq[i] = 1 / (theta^(i / dim))
+        inv_freq = 1.0 / (
+            config.rope_theta
+            ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
+        )
+        # ------------------------------
+        # Apply RoPE scaling if configured
+        # ------------------------------
+        if config.rope_scaling is not None:
+            scaling_type = config.rope_scaling.get("type", "linear")
+            scaling_factor = config.rope_scaling.get("factor", 1.0)
+            if scaling_type == "linear":
+                # Linear scaling: divide frequencies by scaling factor
+                inv_freq = inv_freq / scaling_factor
+            elif scaling_type == "dynamic":
+                # Dynamic scaling: adjust based on sequence length
+                # This is a simplified implementation
+                inv_freq = inv_freq / scaling_factor
+            else:
+                print(f"Warning: Unknown RoPE scaling type '{scaling_type}', using linear scaling")
+                inv_freq = inv_freq / scaling_factor
+        # ------------------------------
+        # Compute position indices
+        # ------------------------------
+        # Shape: [seq_len]
+        t = torch.arange(seq_len, dtype=torch.float32)
+        # ------------------------------
+        # Outer product: [seq_len, dim // 2]
+        # Each row i contains: t[i] * inv_freq
+        # ------------------------------
+        freqs = torch.outer(t, inv_freq)
+        # ------------------------------
+        # Duplicate for interleaved sin/cos: [seq_len, dim]
+        # This matches the common format: [sin_0, cos_0, sin_1, cos_1, ...]
+        # ------------------------------
+        emb = torch.cat((freqs, freqs), dim=-1)
+        # ------------------------------
+        # Register cos/sin as buffers
+        # - Stored in float32
+        # - Will be moved to correct device/dtype via model.to(...)
+        # - Not saved with state_dict (persistent=False)
+        # ------------------------------
+        self.register_buffer("cos", emb.cos(), persistent=False)
+        self.register_buffer("sin", emb.sin(), persistent=False)
+    def forward(self, position_ids: torch.LongTensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """ """
+        return None # This function is not necessary.
+"""## MLA"""
+class MultiheadLatentAttention(nn.Module):
+    """
+    A variant of MLA with:
+    - Simplified RoPE handling:
+      - A portion of the head dimensions are used for position information.
+      - Same number of queries as keys. (no MQA)
+    - Optional output subspace
+    """
+    def __init__(self, config: SharedSpaceDecoderConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        # Used to determine if this layer is dense or uses latents.
+        self.layer_idx = layer_idx
+        self.attention_dropout_prob = config.attention_dropout_prob
+        self.num_heads = config.num_attention_heads
+        self.rope_theta = config.rope_theta
+        self.rope_dims = config.rope_dims
+        self.nope_dims = config.nope_dims
+        self.q_shared_dim = config.q_shared_dim
+        self.kv_shared_dim = config.kv_shared_dim
+        self.o_shared_dim = config.o_shared_dim
+        self.qk_private_dim = config.qk_private_dim
+        self.vo_private_dim = config.vo_private_dim
+        self.hidden_size = config.hidden_size
+        # =========================
+        #     Input Projections
+        # =========================
+        # If this is one of the dense layers,
+        if self.layer_idx < config.num_dense_layers:
+            # =========================
+            #     Dense Attention
+            # =========================
+            # No latent projections.
+            self.latent_spaces = False
+            # Define the standard QKV projection
+            self.qkv_proj = nn.Linear(
+                config.hidden_size,
+                self.num_heads * (self.qk_private_dim * 2 + self.vo_private_dim),
+                bias=config.attention_bias,
+            )
+            # Dense output projection
+            self.o_proj = nn.Linear(
+                self.num_heads * self.vo_private_dim,
+                config.hidden_size,
+                bias=config.attention_bias,
+            )
+        # If we're past the dense layers,
+        else:
+            # =========================
+            #     Latent Attention
+            # =========================
+            # Use latent projections.
+            self.latent_spaces = True
+            # Input latent projections
+            print("config.q_shared_dim", config.q_shared_dim)
+            # If we're using a shared query subspace,
+            if config.q_shared_dim is not None:
+                # Set a flag that we'll check in `forward`.
+                self.query_shared = True
+                self.q_shared_proj = nn.Linear(
+                    config.hidden_size,
+                    self.q_shared_dim,
+                    bias=config.attention_bias,
+                )
+                self.q_shared_norm = create_norm_layer(self.q_shared_dim, config)
+            else:
+                print("Using identity for shared projection.")
+                # Set a flag that we'll check in `forward`.
+                self.query_shared = False
+                self.q_shared_dim = config.hidden_size
+                #print("Updated self.q_shared_dim to", self.q_shared_dim)
+                # Use identity.
+                self.q_shared_proj = nn.Identity()
+                self.q_shared_norm = nn.Identity()
+            # If we're using a shared key/value subspace,
+            if config.kv_shared_dim is not None:
+                # Set a flag that we'll check in `forward`.
+                self.keyvalue_shared = True
+                self.kv_shared_proj = nn.Linear(
+                    config.hidden_size,
+                    self.kv_shared_dim,
+                    bias=config.attention_bias,
+                )
+                self.kv_shared_norm = create_norm_layer(self.kv_shared_dim, config)
+            else:
+                # Set a flag that we'll check in `forward`.
+                self.keyvalue_shared = False
+                self.kv_shared_dim = config.hidden_size
+                # Use identity.
+                self.kv_shared_proj = nn.Identity()
+                self.kv_shared_norm = nn.Identity()
+            #print("config.q_shared_dim", config.q_shared_dim)
+            #print("self.qk_private_dim", self.qk_private_dim)
+            # Query heads
+            self.q_private_proj = nn.Linear(
+                self.q_shared_dim,
+                self.num_heads * self.qk_private_dim,
+                bias=False # TODO
+            )
+            # Key and Value heads, concatenated
+            self.kv_private_proj = nn.Linear(
+                self.kv_shared_dim,
+                self.num_heads * (self.qk_private_dim + self.vo_private_dim),
+                bias=False,
+            )
+            # Use output subspace if o_shared_dim is specified
+            self.output_subspace = config.o_shared_dim is not None
+            # If we're using an output subspace,
+            if self.output_subspace:
+                # ==========================
+                #     Output Subspace
+                # ==========================
+                self.o_shared_dim = config.o_shared_dim
+                # Per-head output projections
+                # (Similar to original W^O, but projects the scored value vectors
+                #  into a latent space instead of back to the model)
+                self.o_private_proj = nn.Linear(
+                    self.num_heads * self.vo_private_dim,
+                    self.o_shared_dim,
+                    bias=False
+                )
+                # Norm layer between o_private_proj and o_shared_proj
+                # Note: In previous ViT experiments, this norm step hurt performance, but was beneficial
+                #       in the DeepSeekV3 experiments.
+                # However, we're making it configurable so it can be tested in different contexts.
+                self.o_private_norm = create_norm_layer(self.o_shared_dim, config)
+                # Shared output projection
+                # The head outputs from `o_private_proj` are first summed together (across
+                # heads) in the latent space.
+                # Then we project their combined outputs (a single vector per token)
+                # back to model space via `o_shared_proj`.
+                self.o_shared_proj = nn.Linear(
+                    self.o_shared_dim,
+                    self.hidden_size,
+                    bias=config.attention_bias
+                )
+            else:
+                # Dense output projection
+                self.o_proj = nn.Linear(
+                    self.num_heads * self.vo_private_dim,
+                    config.hidden_size,
+                    bias=config.attention_bias,
+                )
+        # Softmax scaling factor.
+        self.softmax_scale = self.qk_private_dim ** (-0.5)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        #past_key_value: Optional[Cache] = None, # TODO - Can I remove this?
+        #cache_position: Optional[torch.LongTensor] = None, # TODO - Can I remove this?
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        # === Tensor Dimension Symbols ===
+        #    B: batch_size     — number of samples in the batch
+        #    T: seq_len        — number of tokens per sample
+        #    H: n_heads        — number of attention heads
+        #    D: hidden_dim     — model embedding size
+        #   Dv: vo_private_dim - per-head value/output projection dimension
+        #   Dr: rope_dims      - The first Dr dimensions receive rope.
+        #   Cq: q_shared_dim   - query shared subspace size
+        #  Ckv: kv_shared_dim  - key-value shared subspace size
+        #   Co: o_shared_dim   - output shared subspace size
+        # Input token embeddings
+        # hidden_states: [B, T, D]
+        B, T = hidden_states.shape[:2]
+        H = self.num_heads
+        Dq = self.qk_private_dim     # per-head dim for Q and K
+        Dv = self.vo_private_dim     # per-head dim for V/O
+        Dc_q, Dc_kv = self.q_shared_dim, self.kv_shared_dim
+        # ==============================
+        #      QKV Head Projections
+        # ==============================
+        # Project tokens into per-head query, key, and value vectors
+        # If this layer uses latent projections,
+        if self.latent_spaces:
+            # ================================
+            #     Shared Space Projections
+            # ================================
+            # Project token embeddings into shared latents
+            # Input:
+            #     hidden_states [B, T, D]
+            #     q_shared_proj [D, Cq]
+            #    kv_shared_proj [D, Ckv]
+            # Output:
+            #          q_shared  [B, T, Cq]
+            #          kv_shared [B, T, Ckv]
+            # If we're using a shared query subspace,
+            if self.q_shared_dim is not None:
+                q_shared = self.q_shared_proj(hidden_states)
+                # Normalize latent vectors, shapes unchanged.
+                q_shared = self.q_shared_norm(q_shared)
+            # Otherwise,
+            else:
+                # Use the hidden states
+                q_shared = hidden_states
+            # If we're using a shared key/value subspace,
+            if self.kv_shared_dim is not None:
+                # Project token embeddings into shared subspace.
+                kv_shared = self.kv_shared_proj(hidden_states)
+                # Normalize latent vectors, shapes unchanged.
+                kv_shared = self.kv_shared_norm(kv_shared)
+            # Otherwise,
+            else:
+                # Use the hidden states
+                kv_shared = hidden_states
+            # ======================================
+            #     Per-Head (Private) Projections
+            # ======================================
+            # Project query latents onto query heads.
+            # Input:
+            #     q_shared       [B, T, Cq]
+            #     q_private_proj [Cq, H*Dh]
+            # Output:
+            #     queries   [B, T, H*Dh]
+            queries = self.q_private_proj(q_shared)
+            # Project key/value latents onto key and value heads.
+            # The key and value heads are all concatenated, each head occupies
+            # Dh columns of the kv_private_proj. This yields the key and value
+            # vectors concatenated in the same way.
+            #
+            # Input:
+            #          kv_shared [B, T, Ckv]
+            #    kv_private_proj [Ckv, 2*H*Dh]
+            # Output:
+            #     keysvalues [B, T, 2*H*Dh]
+            keysvalues = self.kv_private_proj(kv_shared)
+            # Split into key and value tensors
+            # Each: [B, T, H * Dh]
+            keys, values = keysvalues.chunk(2, dim=-1)
+        # If this is a dense attention layer (no latent projections),
+        else:
+            # ====================
+            #     Standard MHA
+            # ====================
+            # Standard QKV projection
+            # Input:
+            #   hidden_states     [B, T, D]
+            #         qkv_proj    [D, 3*H*Dh]
+            # Output:
+            #   querieskeysvalues [B, T, 3*H*Dh]
+            querieskeysvalues = self.qkv_proj(hidden_states)
+            # Separate query, key, and value vectors
+            # Each: [B, T, H * Dh]
+            queries, keys, values = querieskeysvalues.chunk(3, dim=-1)
+        # Split up queries so that there's just one per row.
+        # Same for keys and values.
+        #
+        # Inputs:
+        #   Each  [B, T, H*Dh]
+        # Output:
+        #   Each  [B, H,  T,  Dh]
+        queries = queries.view(B, T, H, Dq).transpose(1, 2)
+        keys =       keys.view(B, T, H, Dq).transpose(1, 2)
+        values =   values.view(B, T, H, Dv).transpose(1, 2)
+        # ==================
+        #        RoPE
+        # ==================
+        # Apply rotary position embeddings to the first `self.rope_dims` of
+        # each head.
+        # The slice operations are free, but the concatenation is
+        # not, because the outputs of the rotation operation are new data
+        # occupying different memory. Still considered the best option,
+        # though.
+        # 1. Unpack the precomputed cosine and sine embeddings
+        # Position embeddings is a tuple of
+        #    (cos [seq_len, rope_dims],
+        #     sin [seq_len, rope_dims])
+        cos, sin = position_embeddings
+        # 2. Split the query and key heads into the part to rotate and the part
+        #    to pass through (early columns get position info, later ones don't)
+        #
+        #  (Using queries as example)
+        #  Inputs:
+        #    queries  [B, H, T, Dh]  Dh = rope_dims + not_rope_dims
+        #  Outputs:
+        #    q_rope   [B, H, T,  Dr]
+        #    q_pass   [B, H, T, Dh-Dr]
+        q_rope, q_pass = queries[..., :self.rope_dims], queries[..., self.rope_dims:]
+        k_rope, k_pass =    keys[..., :self.rope_dims],    keys[..., self.rope_dims:]
+        # 3. Apply the rotary embedding to the designated slice
+        #
+        # To broadcast cos and sin across the batch and head dimensions, we unsqueeze them.
+        # Shape change: [T, Dr] -> [1, 1, T, Dr]
+        cos = cos.unsqueeze(0).unsqueeze(0)
+        sin = sin.unsqueeze(0).unsqueeze(0)
+        #print("q_rope.shape[-1] // 2:", (q_rope.shape[-1] // 2))
+        #print("x1 = x[..., :x.shape[-1] // 2 ].shape:", q_rope[..., :q_rope.shape[-1] // 2 ].shape)
+        #print("sin/cos.shape:", cos.shape)
+        #print("q_rope.shape:", q_rope.shape)
+        #print("(q_rope * cos).shape:", (q_rope * cos).shape)
+        #print("rotate_half(q_rope).shape:", rotate_half(q_rope).shape)
+        #print("(rotate_half(q_rope) * sin).shape:", (rotate_half(q_rope) * sin).shape)
+        """
+        In this example   batch_size = 2, hum_heads = 8, seq_len = 65, rope_dims = 16
+                        q_rope.shape[-1] // 2: 8
+        x1 = x[..., :x.shape[-1] // 2 ].shape: torch.Size([2, 8, 65, 8])
+                    sin/cos.shape: torch.Size([1, 1, 65, 16])  # After double unsqueeze.
+                    vq_rope.shape: torch.Size([2, 8, 65, 16])
+             (q_rope * cos).shape: torch.Size([2, 8, 65, 16])
+        rotate_half(q_rope).shape: torch.Size([2, 8, 65, 16])
+        (rotate_half(q_rope) * sin).shape: torch.Size([2, 8, 65, 16])
+        """
+        # Let's walk through the queries as the example.
+        # What does rotate half do?
+        #    dim -1 is the row vectors, the queries
+        #
+        #  Step 1: Split the vector in half.
+        #    "q_rope.shape[-1] // 2" <- How much to select. Half the length of the q_rope vector
+        #    x1 = x[..., :x.shape[-1] // 2 ]  # Select the first half of the vector.
+        #    x2 = x[...,  x.shape[-1] // 2:]  # Select the second half.
+        #
+        #  Step 2:
+        #      - Apply negative to the values in the second half.
+        #      - Reverse the order of the halves.
+        #    return torch.cat((-x2, x1), dim=-1)
+        #
+        # ---- (q_rope * cos) ----
+        # Element-wise multiply the values in each `cos` vector with the
+        # corresponding (i.e., same sequence position) `q_rope` vector.
+        #
+        # Inputs:
+        #    q_rope  [B, H, T, Dr]
+        #       cos  [1, 1, T, Dr]
+        #
+        # Outputs:
+        #        x   [B, H, T, Dr]
+        #
+        # ---- (rotate_half(q_rope)) ----
+        #  TODO
+        #
+        # Inputs:
+        #       q_rope    [B, T, Dr]
+        #
+        # Outputs:
+        #   rot_q_rope    [B, T, Dr]
+        #
+        # ---- rotated * sin ----
+        #  TODO
+        q_rotated = (q_rope * cos) + (rotate_half(q_rope) * sin)
+        k_rotated = (k_rope * cos) + (rotate_half(k_rope) * sin)
+        # 4. Concatenate the rotated and pass-through parts back together
+        # Input (each): [B, H, T, Dr] and [B, H, T, Dq-Dr]
+        # Output (each): [B, H, T, Dq]
+        queries = torch.cat((q_rotated, q_pass), dim=-1)
+        keys = torch.cat((k_rotated, k_pass), dim=-1)
+        # ===================
+        #       Attention
+        # ===================
+        # The tensors (queries, keys, values) now have shape [B, H, T, Dq]
+        # and are ready for the attention score calculation.
+        # Only apply dropout during training.
+        # self.training is a pytorch flag.
+        if self.training:
+            dropout_p = self.attention_dropout_prob
+        else:
+            dropout_p = 0.0
+        # Call SDPA / Flash Attention
+        # https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+        attn_output = F.scaled_dot_product_attention(
+            queries,
+            keys,
+            values,
+            attn_mask=None, # attention_mask,
+            dropout_p=dropout_p,
+            scale=self.softmax_scale,
+            is_causal=True, # This is a decoder - apply causal masking
+        )
+        # Reshape output back to [B, T, H * Dv] from [B, H, T, Dv]
+        attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, H * Dv)
+        # =========================
+        #     Output Projection
+        # =========================
+        # If we are using an output latent projection,
+        if self.latent_spaces and self.output_subspace:
+            # Project the attention output into the output latent space.
+            # This is analogous to the W^O matrix in standard attention but
+            # projects to an intermediate latent dimension.
+            attn_output = self.o_private_proj(attn_output)
+            # Apply normalization to the output latents
+            attn_output = self.o_private_norm(attn_output)
+            # Re-project the output latent representation back to model space.
+            attn_output = self.o_shared_proj(attn_output)
+        # If this is a dense layer,
+        else:
+            # Project the values back into model space.
+            attn_output = self.o_proj(attn_output)
+        # -----------------------------------------
+        return attn_output

checkpoint-3000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f6919065c808b2b25bd22da1e5d8cd5f6ee1111b1996e2c9ef132a5bd57e383d
+size 988989899

checkpoint-3000/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02c749023ca02a2260541bcbe926f5ccb8aaea29a6b8d40e44cffcc695c4bdf0
+size 494483579

checkpoint-3000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0cd23626013be2cf1b8f351eb1880911bc653b0d6a7fc40aa3f0c07b9f92b902
+size 14645

checkpoint-3000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:de993877decafa3dbec3a2190f9cbfc7ce6efc997a47f7abd25897f127fbf6ba
+size 1465

checkpoint-3000/shared_space_config.py ADDED Viewed

	@@ -0,0 +1,329 @@

+"""# `shared_space_config.py`
+#### `*Config`
+"""
+from typing import Optional
+import torch
+from torch import nn
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_utils import PreTrainedModel
+"""`def make_shorthand`"""
+def make_shorthand(model_cfg):
+    """
+    Takes an instance subencoder `*Config` and constructs a shorthand
+    name for the model based on settings.
+    """
+    dense_str = str(model_cfg.num_dense_layers) + "mha + "
+    if model_cfg.o_shared_dim is not None:
+        o_str = "." + str(model_cfg.o_shared_dim)
+    else:
+        o_str = ""
+    # If no output subspace is used, the dimension will show as -1.
+    attn_str = (
+        dense_str
+        + "mla."
+        + str(model_cfg.q_shared_dim)
+        + "."
+        + str(model_cfg.kv_shared_dim)
+        + o_str
+    )
+    # MLP Configuration
+    if model_cfg.ffn_decompose:
+        dense_str = (
+            str(model_cfg.num_dense_layers)
+            + "mlp."
+            + str(model_cfg.intermediate_size)
+            + " + "
+        )
+        mlp_str = (
+            dense_str
+            + str(model_cfg.num_hidden_layers - model_cfg.num_dense_layers)
+            + "dcmp."
+            + "x"
+            + str(model_cfg.intermediate_size)
+            + "."
+            + str(model_cfg.ffn_rank)
+        )
+    else:
+        mlp_str = "mlp." + str(model_cfg.intermediate_size)
+    # Assemble string
+    shorthand = (
+        f"{attn_str} - {mlp_str} - "
+        f"h{model_cfg.hidden_size} - l{model_cfg.num_hidden_layers}"
+    )
+    """
+    The run name includes training settings
+    run_name = (
+        f"{config['stats']['total_elements']} - "
+        f"{attn_str} - {mlp_str} - "
+        f"h{model_cfg.hidden_size} - l{model_cfg.num_hidden_layers} - "
+        f"bs{ptrain_cfg['train_batch_size']} - lr{lr_str} - "
+        f"seq{ptrain_cfg['max_seq_length']}"
+    )
+    """
+    return shorthand
+class SharedSpaceDecoderConfig(PretrainedConfig):
+    r"""
+    Configuration class for SharedSpaceDecoderConfig.
+    Extends the HuggingFace `PretrainedConfig` to support architectural
+    variations including:
+    - Multi-Head Latent Attention (MLA)
+    - Decomposed MLPs (low-rank FFNs)
+    - Flexible attention backends (eager, flash, sdpa)
+    - Explicit shared subspaces for Q, K, V, and O projections
+    This config does not infer any defaults based on `hidden_size`. All
+    dimensions and ranks must be explicitly specified. If required values are
+    missing, a `ValueError` is raised during initialization.
+    ----------------------
+    Core Model Parameters:
+    ----------------------
+    - vocab_size (`int`) — Vocabulary size.
+    - hidden_size (`int`) — Model hidden dimension.
+    - num_hidden_layers (`int`) — Number of transformer blocks.
+    - intermediate_size (`int`) — Feed-forward hidden dimension.
+    - hidden_act (`str`) — Activation function.
+    - hidden_dropout_prob (`float`) — Dropout after projections and FFNs.
+    - attention_dropout_prob (`float`) — Dropout applied to attention scores.
+    - max_position_embeddings (`int`) — Max sequence length.
+    - initializer_range (`float`) — Stddev of weight init.
+    - layer_norm_eps (`float`) — Epsilon for LayerNorm.
+    - rms_norm_eps (`float`) — Epsilon for RMSNorm.
+    - classifier_dropout (`float` or None) — Dropout for final classifier.
+    - vocab_subspace
+    - vocab_rank
+    ----------------------------------
+    Multi-Head Latent Attention (MLA):
+    ----------------------------------
+    - num_attention_heads (`int`) — Number of attention heads.
+    - q_shared_dim (`int`) — Rank of the shared query subspace.
+    - kv_shared_dim (`int`) — Rank of the shared key/value subspace.
+    - output_subspace (`bool`) — Whether to use a shared latent subspace for output projections.
+    - o_shared_dim (`int`) — Rank of the shared output subspace (required if `output_subspace=True`).
+    - qk_private_dim (`int`) — Query/key private dimension per head.
+    - vo_private_dim (`int`) — Value/output private dimension per head.
+    - rope_dims (`int`) — Number of head dimensions carrying RoPE.
+    - nope_dims (`int`) — Non-positional encoding dimensions.
+    - rope_theta (`float`) — Base frequency used for RoPE.
+    - rope_scaling (`dict` or None) — HF-style scaling dict for RoPE.
+    - attention_bias (`bool`) — Whether to include bias terms in Q/K/V projections.
+    - num_dense_layers (`int`) — Number of leading layers that do not use
+                                 subspaces for attention or FFNs.
+    - attention_backend (`str`) — Must be one of `"eager"`, `"flash_attention_2"`, or `"sdpa"`.
+    ----------------------
+    Decomposed MLP (Low-Rank FFN):
+    ----------------------
+    - ffn_decompose (`bool`) — Whether to enable low-rank FFNs.
+    - ffn_rank (`int`) — Rank of the shared FFN latent space (required if `ffn_decompose=True`).
+    ----------------------
+    Validation Behavior:
+    ----------------------
+    Raises `ValueError` at init time if:
+    - FFN decomposition is enabled without specifying `ffn_rank`.
+    - An unknown `attention_backend` is provided.
+    """
+    model_type = "shared_subspace_decoder"
+    def __init__(
+        self,
+        # === Core Model ===
+        vocab_size:         int = 30522,
+        hidden_size:        int = 512,
+        num_hidden_layers:  int = 12,
+        intermediate_size:  int = 3072,
+        hidden_dropout_prob=0.1,
+        attention_dropout_prob=0.1,
+        max_position_embeddings: int = 2048,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        rms_norm_eps=1e-6, # Their default, but confirm in config.
+        norm_type="layernorm", # Choice between "layernorm" and "rmsnorm"
+        classifier_dropout=None,
+        vocab_subspace=False,
+        vocab_rank=None,
+        tie_word_embeddings=True,
+        # === Multi-Head Latent Attention ===
+        num_attention_heads: int = 16,
+        rope_dims:           int = 16,
+        q_shared_dim:        int = None,
+        kv_shared_dim:       int = None,
+        o_shared_dim=None,  # If None, no output subspace is used
+        # Private head dimensions
+        qk_private_dim:      int = None,  # Query/key private dimension per head
+        vo_private_dim:      int = None,  # Value/output private dimension per head
+        nope_dims:           int = None,  # Non-positional encoding dimensions
+        attention_backend="eager",
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        # === MLA Composition ===
+        num_dense_layers=12,  # dense MHA layers before MLA starts
+        # === Decomposed MLP ===
+        ffn_decompose=False,
+        ffn_rank=None,
+        **kwargs
+    ) -> None:
+        """Store every hyperparameter on the config instance.
+        Each named argument is copied verbatim onto an attribute of the same
+        name; any remaining keyword arguments are forwarded to the parent
+        class constructor.
+        No validation is performed here: the call to `_validate()` at the end
+        of this method is commented out, so invalid combinations are NOT
+        rejected at init time. Call `_validate()` explicitly to check a
+        config. Note that the defaults alone do not pass `_validate()`
+        (`qk_private_dim`, `vo_private_dim` and `nope_dims` default to None).
+        """
+        # Let the parent class consume the keyword arguments it understands.
+        super().__init__(**kwargs)
+        # === Core Model ===
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_dropout_prob = attention_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.rms_norm_eps = rms_norm_eps
+        self.norm_type = norm_type
+        self.classifier_dropout = classifier_dropout
+        self.vocab_subspace = vocab_subspace
+        self.vocab_rank = vocab_rank
+        # Assigned after the parent constructor ran, so this value is the one
+        # that ends up on the instance.
+        self.tie_word_embeddings = tie_word_embeddings
+        # === MLA ===
+        self.num_attention_heads = num_attention_heads
+        self.rope_dims = rope_dims
+        self.q_shared_dim = q_shared_dim
+        self.kv_shared_dim = kv_shared_dim
+        self.o_shared_dim = o_shared_dim
+        # Private head dimensions
+        self.qk_private_dim = qk_private_dim
+        self.vo_private_dim = vo_private_dim
+        self.nope_dims = nope_dims
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.num_dense_layers = num_dense_layers
+        # === Decomposed FFN ===
+        self.ffn_decompose = ffn_decompose
+        self.ffn_rank = ffn_rank
+        # === Attention backend ===
+        self.attention_backend = attention_backend
+        # === Validation (currently disabled) ===
+        # TODO - Somewhere during training these get instantiated with bad
+        #        values, so validating here raises.
+        # NOTE(review): presumably the framework builds a default-argument
+        #        instance internally, which cannot pass `_validate()` because
+        #        the required MLA dimensions default to None - confirm before
+        #        re-enabling the call below.
+        #self._validate()
+        #print(f"  > SubEnc *Config.init: {make_shorthand(self)}\n")
+    def _validate(self):
+        # === Model ===
+        if self.num_dense_layers > self.num_hidden_layers:
+            raise ValueError("`num_dense_layers` must be <= `num_hidden_layers`")
+        if self.vocab_subspace and self.vocab_rank is None:
+            raise ValueError("`vocab_rank` must be set when `vocab_subspace=True`")
+        # === MLA Validation ===
+        # At least one of q_shared_dim or kv_shared_dim must be set if we have subspace layers
+        if self.num_dense_layers < self.num_hidden_layers and self.q_shared_dim is None and self.kv_shared_dim is None:
+            raise ValueError("At least one of q_shared_dim or kv_shared_dim must be set when there are subspace layers")
+        # Validate that private dimensions are set
+        if self.qk_private_dim is None or self.vo_private_dim is None:
+            raise ValueError("Must set qk_private_dim and vo_private_dim")
+        if self.nope_dims is None:
+            raise ValueError("Must set nope_dims")
+        # === Decomposed FFN ===
+        if self.ffn_decompose and self.ffn_rank is None:
+            raise ValueError("`ffn_rank` must be set when `ffn_decompose=True`")
+        if self.ffn_decompose and self.num_dense_layers >= self.num_hidden_layers:
+            raise ValueError("`ffn_decompose` was set but `num_dense` is >= number of layers")
+        # === Attention Backend ===
+        valid_backends = ["eager", "flash_attention_2", "sdpa"]
+        if self.attention_backend not in valid_backends:
+            raise ValueError(f"Unknown attention backend: {self.attention_backend}, options are {valid_backends}")
+        # === Norm Type ===
+        valid_norm_types = ["layernorm", "rmsnorm"]
+        if self.norm_type not in valid_norm_types:
+            raise ValueError(f"Unknown norm type: {self.norm_type}, options are {valid_norm_types}")
+#### `get_config`
+import json
+def get_config(filename):
+    # Load the config file.
+    with open(filename) as f:
+        full_cfg = json.load(f)
+    # Strict key check on the model configuration.
+    # Get the list of keys allowed / required by `*Config`
+    valid_keys = SharedSpaceDecoderConfig.__init__.__code__.co_varnames
+    # Remove `self` and `kwargs`
+    valid_keys = set(valid_keys) - {"self", "kwargs"}
+    # Compare the set of keys in the json file vs `*Config`
+    extra_keys = set(full_cfg["model"]) - valid_keys
+    missing_keys = valid_keys - set(full_cfg["model"])
+    # If there any in the `json` that aren't in `*Config`,
+    if extra_keys:
+        # List them for the user.
+        raise ValueError(f"Unknown keys in config: {sorted(extra_keys)}")
+    #  If the json config is missing required keys,
+    if missing_keys:
+        # List them for the user.
+        raise ValueError(f"config json is missing: {sorted(missing_keys)}")
+    # Will raise TypeError, by design, if required args are missing
+    # The asterisks unpack the dictionary into a list of keywords as though
+    # all of the settings were writting out individually.
+    model_cfg = SharedSpaceDecoderConfig(**full_cfg["model"])
+    return full_cfg, model_cfg

checkpoint-3000/shared_space_decoder.py ADDED Viewed

	@@ -0,0 +1,386 @@

+# -*- coding: utf-8 -*-
+"""# shared_subspace_encoder.py"""
+from typing import Optional
+import torch
+from torch import nn
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_utils import PreTrainedModel
+from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
+from .mla import MultiheadLatentAttention, RotaryEmbedding
+from .feedforward import SubspaceFeedForward
+from .shared_space_config import SharedSpaceDecoderConfig
+"""`RMSNorm`
+From:
+https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py
+TODO - May not need?
+"""
+class DeepseekV3RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        DeepseekV3RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
+    """
+    Create a normalization layer based on the config norm_type.
+    Args:
+        hidden_size: The dimension to normalize over
+        config: Configuration containing norm_type and epsilon values
+    Returns:
+        Either a LayerNorm or RMSNorm layer
+    """
+    if config.norm_type == "layernorm":
+        return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+    elif config.norm_type == "rmsnorm":
+        return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
+    else:
+        # This should be caught by config validation, but being defensive
+        raise ValueError(f"Unknown norm_type: {config.norm_type}")
+"""#### *PreTrainedModel"""
+class SharedSpaceDecoderPreTrainedModel(PreTrainedModel):
+    """
+    The **PreTrainedModel object:
+      - Is instantiated when TODO
+      - Initializes:
+        - TODO
+      - Provides access to TODO
+      - Executes TODO
+    """
+    config_class = SharedSpaceDecoderConfig
+    base_model_prefix = "model"
+    def _init_weights(self, module: nn.Module) -> None:
+        """Weight initialization hook used by :class:`PreTrainedModel`.
+        ``PreTrainedModel.post_init`` will recursively apply this function to
+        every submodule right after construction.  HuggingFace models override
+        it so that creating a model from scratch yields the same initialization
+        as ``from_pretrained`` when no checkpoint is supplied.
+        This decoder-specific initialization strategy includes:
+        - Proper handling of configurable normalization layers (LayerNorm or RMSNorm)
+        - Special initialization for language modeling heads
+        - Considerations for causal attention and autoregressive modeling
+        - Support for both dense and decomposed vocabulary embeddings
+        """
+        if isinstance(module, nn.Linear):
+            # Standard linear layer initialization
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            # Initialize embeddings with normal distribution
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, DeepseekV3RMSNorm):
+            # RMSNorm initialization: weight to 1.0, no bias term
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.LayerNorm):
+            # LayerNorm initialization: bias to 0, weight to 1.0
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+"""# ▂▂▂▂▂▂▂▂▂▂▂▂
+# Classes
+"""
+"""#### `*Layer`"""
+class SharedSpaceDecoderLayer(nn.Module):
+    """
+    The **Layer object:
+      - Is instantiated by :class:`SharedSpaceDecoderModel` for each
+        Transformer block in the decoder.
+      - Initializes:
+        - ``self_attn`` – multi-head latent attention implementing either
+          dense or latent projections depending on the configuration.
+        - ``ffn`` – a :class:`SubspaceFeedForward` block.
+        - RMSNorm layers for pre-attention and pre-FFN normalization.
+      - Provides access to the attention and feed-forward submodules via the
+        attributes ``self_attn`` and ``ffn``.
+      - Executes a single decoder block in :meth:`forward`.
+    """
+    def __init__(self, config: SharedSpaceDecoderConfig, layer_idx: int) -> None:
+        super().__init__()
+        # Norm applied prior to attention.
+        self.attn_input_norm = create_norm_layer(config.hidden_size, config)
+        # Attention block
+        self.self_attn = MultiheadLatentAttention(config, layer_idx)
+        # Norm applied prior to FFN
+        self.ffn_input_norm = create_norm_layer(config.hidden_size, config)
+        # Feed-forward network used after attention
+        self.ffn = SubspaceFeedForward(config, layer_idx)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor], # RoPE embeddings
+        attention_mask: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # ========================
+        #     Self Attention
+        # ========================
+        residual_strm = hidden_states
+        # Normalize the hidden states to create the input to attention.
+        attn_input = self.attn_input_norm(hidden_states)
+        # Evaluate
+        attn_output = self.self_attn(
+            attn_input,
+            position_embeddings,
+            attention_mask,
+        )
+        # Add the attention output (the residual) back to the non-normalized
+        # hidden_states.
+        hidden_states = residual_strm + attn_output
+        # ===========================
+        #     Feed-Forward Network
+        # ===========================
+        residual_strm = hidden_states
+        # Normalize the updated hidden states prior to the FFN
+        ffn_input = self.ffn_input_norm(hidden_states)
+        # Evaluate
+        ffn_output = self.ffn(ffn_input)
+        # Add the output the un-normalized hidden states.
+        hidden_states = residual_strm + ffn_output
+        return hidden_states
+"""#### *Model"""
+class SharedSpaceDecoderModel(SharedSpaceDecoderPreTrainedModel):
+    """
+    The **Model object:
+      - Initializes:
+        - The vocabulary embeddings (and optional decomposition)
+        - Position embeddings (calculated in RotaryEmbedding)
+        - All of the **Layer objects.
+      - Provides interface to vocab embeddings.
+      - Executes the whole decoder model in `forward` with causal attention.
+      This is the base decoder without the language modeling head.
+      Use SubspaceDecoderForCausalLM for language modeling tasks.
+    """
+    def __init__(self, config: SharedSpaceDecoderConfig) -> None:
+        super().__init__(config)
+        # ============================
+        #    Vocabulary Embeddings
+        # ============================
+        # Decomposing the vocabulary (if enabled) defines a shared projection
+        # which constrains the model to store semantic information (and
+        # whatever other static token knowledge) into a limited set of
+        # feature directions.
+        # If we're decomposing the token embeddings,
+        # TODO - Rename to vocab_subspace.
+        if config.vocab_subspace:
+            # Create the embedding table. Vocabulary embeddings are learned
+            # in a lower dimensional latent space.
+            self.vocab_embed = nn.Embedding(
+                config.vocab_size, # Number of tokens
+                config.vocab_rank  # Subspace dimension
+            )
+            # Create a
+            # Selected token latents will be projected up to model size.
+            # vocab_proj has shape [vocab_rank x model_size]
+            self.vocab_proj = nn.Linear(
+                config.vocab_rank,  # Size of latents
+                config.hidden_size, # Model size
+                bias=False
+            )
+        # Otherwise, for a dense vocabulary,
+        else:
+            # Create the dense embedding table in model space.
+            self.vocab_embed = nn.Embedding(
+                config.vocab_size,  # Number of tokens
+                config.hidden_size  # Model size
+            )
+            self.vocab_proj = None
+        # =====================
+        #   RoPE Embeddings
+        # =====================
+        # Pre-computes the table of RoPE embeddings, leaving them in
+        # GPU memory.
+        self.rope = RotaryEmbedding(config)
+        # ===================
+        #    Create Layers
+        # ===================
+        layers = []
+        # For each layer,
+        for i in range(config.num_hidden_layers):
+            # Create a **Layer, providing the config and indicating its number.
+            layers.append(
+                SharedSpaceDecoderLayer(
+                    config,
+                    layer_idx = i
+                )
+            )
+        # Wrap in torch ModuleList
+        self.layers = nn.ModuleList(layers)
+        # Whatever huggingface does behind the scenes...
+        self.post_init()
+    # Agents: Do not define boilerplate helpers, e.g., get/set_input_embeddings
+    def embed(self, input_ids: torch.LongTensor) -> torch.Tensor:
+        """
+        Return token embeddings for input ids.
+        This will perform the up projection to model space if the vocabulary is
+        decomposed.
+        input_ids have shape [batch_size, seq_len]
+        """
+        # If the vocabulary is decomposed,
+        if self.vocab_proj is not None:
+            # Retrieve the latents
+            #  input_ids: [batch_size, seq_len]
+            #          x: [batch_size, seq_len, latent_dim]
+            x = self.vocab_embed(input_ids)
+            #  Project the latents back to model space and return.
+            return(self.vocab_proj(x))
+        # If the vocabulary is dense,
+        else:
+            # Just return the embeddings.
+            return self.vocab_embed(input_ids)
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Run the full decoder stack with causal attention.
+        Inputs:
+            input_ids       [batch_size, seq_len]
+            attention_mask  [batch_size, seq_len] - 1 for real tokens, 0 for padding
+        Returns:
+            Final decoder layer output   [batch_size, seq_len, model_size]
+        """
+        # Retrieve the token embeddings for this sequence.
+        # These are model_size, regardless of whether the vocab is decompd.
+        hidden_states = self.embed(input_ids)
+        # Retrieve the rotary position embeddings for all of the positions in
+        # our current input sequence.
+        seq_len = hidden_states.size(1)
+        # Retrieves just the ones necessary for the sequence length of the
+        # input. These are vectors, two per token. Their length is the
+        # number of head dimensions we're applying RoPE to.
+        #  Input
+        #     cos: [max_seq_len, rope_dims]
+        #     sin: [max_seq_len, rope_dims]
+        #  Outputs:
+        #     R_cos [seq_len, rope_dims]
+        #     R_sin [seq_len, rope_dims]
+        R_cos = self.rope.cos[:seq_len]
+        R_sin = self.rope.sin[:seq_len]
+        # ===============================
+        #   Attention Mask Conversion
+        # ===============================
+        """
+        use_sdpa_attention_masks = (
+            self.attn_implementation == "sdpa"
+            and self.position_embedding_type == "absolute"
+            and head_mask is None
+            and not output_attentions
+        )
+        """
+        # Expand the attention mask
+        #if use_sdpa_attention_masks and attention_mask.dim() == 2:
+        if True:
+            # Expand the attention mask for SDPA.
+            # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
+            extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                attention_mask,
+                hidden_states.dtype,
+                tgt_len = seq_len
+            )
+            attention_mask = extended_attention_mask
+        # Run the model!
+        # For each decoder layer,
+        for layer_i, layer in enumerate(self.layers):
+            # Evaluate the layer
+            hidden_states = layer(
+                hidden_states,       # Token embeddings
+                (R_cos, R_sin),      # Rope embeddings, passed as a tuple.
+                attention_mask,      # Attn mask
+            )
+        # Return the final output of the decoder stack.
+        return hidden_states

checkpoint-3000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

checkpoint-3000/task_heads.py ADDED Viewed

	@@ -0,0 +1,196 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Union
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from .shared_space_config import SharedSpaceDecoderConfig
+from .shared_space_decoder import (
+    SharedSpaceDecoderPreTrainedModel,
+    SharedSpaceDecoderModel,
+    DeepseekV3RMSNorm
+)
+def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
+    """
+    Create a normalization layer based on the config norm_type.
+    Args:
+        hidden_size: The dimension to normalize over
+        config: Configuration containing norm_type and epsilon values
+    Returns:
+        Either a LayerNorm or RMSNorm layer
+    """
+    if config.norm_type == "layernorm":
+        return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+    elif config.norm_type == "rmsnorm":
+        from .shared_space_decoder import DeepseekV3RMSNorm
+        return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
+    else:
+        # This should be caught by config validation, but being defensive
+        raise ValueError(f"Unknown norm_type: {config.norm_type}")
+class SharedSpaceDecoderForCausalLM(SharedSpaceDecoderPreTrainedModel):
+    """
+    Subspace Decoder model with a causal language modeling head.
+    This model extends the SharedSpaceDecoderModel with:
+    - A language modeling head that projects hidden states to vocabulary logits
+    - Support for computing cross-entropy loss for language modeling
+    - Proper HuggingFace compatibility for causal language modeling tasks
+    - Decoder-specific initialization strategies
+    The model can be used for:
+    - Text generation
+    - Language modeling pretraining
+    - Fine-tuning on downstream tasks
+    """
+    def __init__(self, config: SharedSpaceDecoderConfig) -> None:
+        super().__init__(config)
+        # Initialize the base decoder model
+        self.model = SharedSpaceDecoderModel(config)
+        # Final layer norm before the language modeling head
+        self.norm = create_norm_layer(config.hidden_size, config)
+        # Language modeling head
+        # Projects from hidden_size to vocab_size to get logits for each token
+        self.lm_head = nn.Linear(
+            config.hidden_size,
+            config.vocab_size,
+            bias=False  # Following common practice in modern LMs
+        )
+        # Initialize weights with decoder-specific strategy
+        # Note: tie_weights() will be called automatically by post_init() if config.tie_word_embeddings=True
+        self.post_init()
+    def _init_weights(self, module: nn.Module) -> None:
+        """
+        Decoder-specific weight initialization with special handling for language modeling head.
+        Key differences from encoder initialization:
+        - Language modeling head gets specialized initialization for stability
+        - Configurable normalization layers (LayerNorm or RMSNorm) are properly handled
+        - Weight tying considerations for embedding/lm_head relationship
+        """
+        # Use the base class initialization for most modules
+        super()._init_weights(module)
+        # Special handling for language modeling head
+        if module is self.lm_head:
+            # Use smaller initialization for the language modeling head
+            # This helps with training stability in autoregressive generation
+            # Common practice is to use std=initializer_range or smaller
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            # If weight tying is not used, we might want even smaller init
+            if self.model.vocab_proj is not None:
+                # For vocab subspace models where weights aren't tied,
+                # use a smaller scale to prevent initial logits from being too large
+                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range * 0.5)
+    def get_input_embeddings(self):
+        """Return the input embedding layer for compatibility with HuggingFace."""
+        return self.model.vocab_embed
+    def set_input_embeddings(self, value):
+        """Set the input embedding layer for compatibility with HuggingFace."""
+        self.model.vocab_embed = value
+    def get_output_embeddings(self):
+        """Return the output embedding layer (lm_head) for compatibility."""
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        """Set the output embedding layer for compatibility."""
+        self.lm_head = new_embeddings
+    def tie_weights(self):
+        """
+        Tie the input and output embedding weights.
+        This method sets the language modeling head's weight to be the same as
+        the input embedding weight. This reduces the number of parameters and
+        is a common practice in modern language models.
+        Note: For vocab subspace models, we need to handle the case where
+        input embeddings go through a projection layer.
+        """
+        # Only tie when embeddings live in model space (no vocab_proj)
+        if getattr(self.model, "vocab_proj", None) is None:
+            # Use HF utility for correct tying/cloning semantics
+            self._tie_or_clone_weights(self.lm_head, self.model.vocab_embed)
+        # else: leave untied for subspace case
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[CausalLMOutputWithPast, tuple]:
+        """
+        Forward pass for causal language modeling.
+        Args:
+            input_ids: Token ids of shape [batch_size, seq_len]
+            attention_mask: Attention mask of shape [batch_size, seq_len]
+                           (1 for real tokens, 0 for padding)
+            labels: Ground truth token ids for computing loss. Same shape as input_ids.
+                   If provided, loss will be computed. Typically input_ids shifted by 1.
+        Returns:
+            CausalLMOutputWithPast containing:
+            - logits: Prediction logits of shape [batch_size, seq_len, vocab_size]
+            - loss: Cross-entropy loss if labels provided, else None
+            - hidden_states: Final layer hidden states [batch_size, seq_len, hidden_size]
+        """
+        # Run the base decoder model
+        # This applies all the transformer layers with causal attention
+        hidden_states = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            **kwargs
+        )
+        # Apply final layer normalization
+        # This normalizes the final hidden states before the language modeling head
+        hidden_states = self.norm(hidden_states)
+        # Project to vocabulary logits
+        # Shape: [batch_size, seq_len, vocab_size]
+        logits = self.lm_head(hidden_states)
+        # Compute loss if labels are provided
+        # Previously, we had custom loss computation here, but now we use the
+        # standard HuggingFace loss function.
+        loss = None
+        if labels is not None:
+            # Flatten the tokens
+            loss = self.loss_function(
+                logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+        # Return in HuggingFace format
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=None,  # Not implementing KV cache yet
+            #hidden_states=hidden_states,
+            hidden_states=hidden_states if kwargs.get("output_hidden_states", False) else None,
+            attentions=None,
+        )

checkpoint-3000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-3000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

checkpoint-3000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1174 @@

+{
+  "best_global_step": 3000,
+  "best_metric": 3.587533058962724,
+  "best_model_checkpoint": "checkpoints/gpt-2_seq1024_mla0-0-0/checkpoint-3000",
+  "epoch": 0.9042272624519629,
+  "eval_steps": 300,
+  "global_step": 3000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006028181749679753,
+      "grad_norm": 2.0441031455993652,
+      "learning_rate": 3.166666666666667e-05,
+      "loss": 10.3921,
+      "step": 20
+    },
+    {
+      "epoch": 0.012056363499359506,
+      "grad_norm": 1.7166392803192139,
+      "learning_rate": 6.500000000000001e-05,
+      "loss": 9.241,
+      "step": 40
+    },
+    {
+      "epoch": 0.01808454524903926,
+      "grad_norm": 0.7614754438400269,
+      "learning_rate": 9.833333333333333e-05,
+      "loss": 8.2219,
+      "step": 60
+    },
+    {
+      "epoch": 0.02411272699871901,
+      "grad_norm": 0.46116286516189575,
+      "learning_rate": 0.00013166666666666665,
+      "loss": 7.5368,
+      "step": 80
+    },
+    {
+      "epoch": 0.030140908748398764,
+      "grad_norm": 1.6069883108139038,
+      "learning_rate": 0.000165,
+      "loss": 7.21,
+      "step": 100
+    },
+    {
+      "epoch": 0.03616909049807852,
+      "grad_norm": 1.2077178955078125,
+      "learning_rate": 0.00019833333333333335,
+      "loss": 6.8976,
+      "step": 120
+    },
+    {
+      "epoch": 0.04219727224775827,
+      "grad_norm": 0.642643928527832,
+      "learning_rate": 0.00023166666666666667,
+      "loss": 6.641,
+      "step": 140
+    },
+    {
+      "epoch": 0.04822545399743802,
+      "grad_norm": 0.7632517218589783,
+      "learning_rate": 0.00026500000000000004,
+      "loss": 6.4194,
+      "step": 160
+    },
+    {
+      "epoch": 0.05425363574711778,
+      "grad_norm": 1.2542935609817505,
+      "learning_rate": 0.00029833333333333334,
+      "loss": 6.2428,
+      "step": 180
+    },
+    {
+      "epoch": 0.06028181749679753,
+      "grad_norm": 1.1738799810409546,
+      "learning_rate": 0.0003316666666666667,
+      "loss": 6.1026,
+      "step": 200
+    },
+    {
+      "epoch": 0.06630999924647728,
+      "grad_norm": 0.46843603253364563,
+      "learning_rate": 0.000365,
+      "loss": 5.9586,
+      "step": 220
+    },
+    {
+      "epoch": 0.07233818099615703,
+      "grad_norm": 0.3949953019618988,
+      "learning_rate": 0.00039833333333333333,
+      "loss": 5.8304,
+      "step": 240
+    },
+    {
+      "epoch": 0.07836636274583679,
+      "grad_norm": 1.0866719484329224,
+      "learning_rate": 0.0004316666666666667,
+      "loss": 5.7185,
+      "step": 260
+    },
+    {
+      "epoch": 0.08439454449551655,
+      "grad_norm": 0.34839189052581787,
+      "learning_rate": 0.000465,
+      "loss": 5.6037,
+      "step": 280
+    },
+    {
+      "epoch": 0.09042272624519629,
+      "grad_norm": 0.5791395902633667,
+      "learning_rate": 0.0004983333333333334,
+      "loss": 5.5003,
+      "step": 300
+    },
+    {
+      "epoch": 0.09042272624519629,
+      "eval_loss": 5.428642945231348,
+      "eval_perplexity": 227.839844387096,
+      "eval_runtime": 113.1707,
+      "eval_samples_per_second": 14.659,
+      "eval_steps_per_second": 0.23,
+      "step": 300
+    },
+    {
+      "epoch": 0.09645090799487605,
+      "grad_norm": 0.532134473323822,
+      "learning_rate": 0.0004964814814814814,
+      "loss": 5.3805,
+      "step": 320
+    },
+    {
+      "epoch": 0.1024790897445558,
+      "grad_norm": 0.5876124501228333,
+      "learning_rate": 0.0004927777777777777,
+      "loss": 5.2775,
+      "step": 340
+    },
+    {
+      "epoch": 0.10850727149423556,
+      "grad_norm": 0.4140375256538391,
+      "learning_rate": 0.0004890740740740741,
+      "loss": 5.193,
+      "step": 360
+    },
+    {
+      "epoch": 0.1145354532439153,
+      "grad_norm": 0.5529780387878418,
+      "learning_rate": 0.0004853703703703704,
+      "loss": 5.0912,
+      "step": 380
+    },
+    {
+      "epoch": 0.12056363499359506,
+      "grad_norm": 0.3331276774406433,
+      "learning_rate": 0.0004816666666666667,
+      "loss": 5.0266,
+      "step": 400
+    },
+    {
+      "epoch": 0.1265918167432748,
+      "grad_norm": 0.6916081309318542,
+      "learning_rate": 0.00047796296296296297,
+      "loss": 4.9375,
+      "step": 420
+    },
+    {
+      "epoch": 0.13261999849295456,
+      "grad_norm": 0.31799590587615967,
+      "learning_rate": 0.0004742592592592593,
+      "loss": 4.9026,
+      "step": 440
+    },
+    {
+      "epoch": 0.1386481802426343,
+      "grad_norm": 0.5735257267951965,
+      "learning_rate": 0.00047055555555555555,
+      "loss": 4.8075,
+      "step": 460
+    },
+    {
+      "epoch": 0.14467636199231407,
+      "grad_norm": 0.5078163146972656,
+      "learning_rate": 0.00046685185185185187,
+      "loss": 4.759,
+      "step": 480
+    },
+    {
+      "epoch": 0.15070454374199382,
+      "grad_norm": 0.3884360194206238,
+      "learning_rate": 0.00046314814814814813,
+      "loss": 4.6706,
+      "step": 500
+    },
+    {
+      "epoch": 0.15673272549167358,
+      "grad_norm": 0.5591968297958374,
+      "learning_rate": 0.00045944444444444445,
+      "loss": 4.627,
+      "step": 520
+    },
+    {
+      "epoch": 0.16276090724135334,
+      "grad_norm": 0.29390859603881836,
+      "learning_rate": 0.0004557407407407407,
+      "loss": 4.5691,
+      "step": 540
+    },
+    {
+      "epoch": 0.1687890889910331,
+      "grad_norm": 0.3524036407470703,
+      "learning_rate": 0.00045203703703703703,
+      "loss": 4.5066,
+      "step": 560
+    },
+    {
+      "epoch": 0.17481727074071282,
+      "grad_norm": 0.4607946574687958,
+      "learning_rate": 0.0004483333333333333,
+      "loss": 4.4661,
+      "step": 580
+    },
+    {
+      "epoch": 0.18084545249039258,
+      "grad_norm": 0.2523379623889923,
+      "learning_rate": 0.00044462962962962967,
+      "loss": 4.4331,
+      "step": 600
+    },
+    {
+      "epoch": 0.18084545249039258,
+      "eval_loss": 4.357323692651888,
+      "eval_perplexity": 78.04797429242741,
+      "eval_runtime": 112.5783,
+      "eval_samples_per_second": 14.736,
+      "eval_steps_per_second": 0.231,
+      "step": 600
+    },
+    {
+      "epoch": 0.18687363424007233,
+      "grad_norm": 0.3953287601470947,
+      "learning_rate": 0.00044092592592592594,
+      "loss": 4.3877,
+      "step": 620
+    },
+    {
+      "epoch": 0.1929018159897521,
+      "grad_norm": 0.38012024760246277,
+      "learning_rate": 0.00043722222222222225,
+      "loss": 4.3619,
+      "step": 640
+    },
+    {
+      "epoch": 0.19892999773943185,
+      "grad_norm": 0.3135126233100891,
+      "learning_rate": 0.0004335185185185185,
+      "loss": 4.3442,
+      "step": 660
+    },
+    {
+      "epoch": 0.2049581794891116,
+      "grad_norm": 0.49422895908355713,
+      "learning_rate": 0.00042981481481481484,
+      "loss": 4.3045,
+      "step": 680
+    },
+    {
+      "epoch": 0.21098636123879136,
+      "grad_norm": 0.27019643783569336,
+      "learning_rate": 0.0004261111111111111,
+      "loss": 4.2808,
+      "step": 700
+    },
+    {
+      "epoch": 0.21701454298847112,
+      "grad_norm": 0.3935423791408539,
+      "learning_rate": 0.0004224074074074074,
+      "loss": 4.2467,
+      "step": 720
+    },
+    {
+      "epoch": 0.22304272473815084,
+      "grad_norm": 0.28830042481422424,
+      "learning_rate": 0.0004187037037037037,
+      "loss": 4.2274,
+      "step": 740
+    },
+    {
+      "epoch": 0.2290709064878306,
+      "grad_norm": 0.2877708673477173,
+      "learning_rate": 0.000415,
+      "loss": 4.2075,
+      "step": 760
+    },
+    {
+      "epoch": 0.23509908823751036,
+      "grad_norm": 0.3449930250644684,
+      "learning_rate": 0.00041129629629629627,
+      "loss": 4.1909,
+      "step": 780
+    },
+    {
+      "epoch": 0.2411272699871901,
+      "grad_norm": 0.326265424489975,
+      "learning_rate": 0.00040759259259259264,
+      "loss": 4.1684,
+      "step": 800
+    },
+    {
+      "epoch": 0.24715545173686987,
+      "grad_norm": 0.3821180760860443,
+      "learning_rate": 0.0004038888888888889,
+      "loss": 4.1513,
+      "step": 820
+    },
+    {
+      "epoch": 0.2531836334865496,
+      "grad_norm": 0.2768200635910034,
+      "learning_rate": 0.0004001851851851852,
+      "loss": 4.1317,
+      "step": 840
+    },
+    {
+      "epoch": 0.2592118152362294,
+      "grad_norm": 0.29724735021591187,
+      "learning_rate": 0.0003964814814814815,
+      "loss": 4.1171,
+      "step": 860
+    },
+    {
+      "epoch": 0.2652399969859091,
+      "grad_norm": 0.2814749479293823,
+      "learning_rate": 0.0003927777777777778,
+      "loss": 4.1049,
+      "step": 880
+    },
+    {
+      "epoch": 0.2712681787355889,
+      "grad_norm": 0.26310989260673523,
+      "learning_rate": 0.00038907407407407407,
+      "loss": 4.0858,
+      "step": 900
+    },
+    {
+      "epoch": 0.2712681787355889,
+      "eval_loss": 4.030442206068737,
+      "eval_perplexity": 56.285795665128816,
+      "eval_runtime": 112.6912,
+      "eval_samples_per_second": 14.722,
+      "eval_steps_per_second": 0.231,
+      "step": 900
+    },
+    {
+      "epoch": 0.2772963604852686,
+      "grad_norm": 0.3063352108001709,
+      "learning_rate": 0.0003853703703703704,
+      "loss": 4.0813,
+      "step": 920
+    },
+    {
+      "epoch": 0.2833245422349484,
+      "grad_norm": 0.4063352346420288,
+      "learning_rate": 0.00038166666666666666,
+      "loss": 4.0647,
+      "step": 940
+    },
+    {
+      "epoch": 0.28935272398462814,
+      "grad_norm": 0.2601182758808136,
+      "learning_rate": 0.000377962962962963,
+      "loss": 4.0447,
+      "step": 960
+    },
+    {
+      "epoch": 0.29538090573430786,
+      "grad_norm": 0.3153649568557739,
+      "learning_rate": 0.00037425925925925924,
+      "loss": 4.0417,
+      "step": 980
+    },
+    {
+      "epoch": 0.30140908748398765,
+      "grad_norm": 0.26004716753959656,
+      "learning_rate": 0.0003705555555555556,
+      "loss": 4.0279,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3074372692336674,
+      "grad_norm": 0.43524181842803955,
+      "learning_rate": 0.0003668518518518519,
+      "loss": 4.0152,
+      "step": 1020
+    },
+    {
+      "epoch": 0.31346545098334716,
+      "grad_norm": 0.2601913511753082,
+      "learning_rate": 0.0003631481481481482,
+      "loss": 4.0026,
+      "step": 1040
+    },
+    {
+      "epoch": 0.3194936327330269,
+      "grad_norm": 0.25712424516677856,
+      "learning_rate": 0.00035944444444444446,
+      "loss": 3.9933,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3255218144827067,
+      "grad_norm": 0.3313097357749939,
+      "learning_rate": 0.0003557407407407408,
+      "loss": 3.9842,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3315499962323864,
+      "grad_norm": 0.238124281167984,
+      "learning_rate": 0.00035203703703703704,
+      "loss": 3.9745,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3375781779820662,
+      "grad_norm": 0.28039032220840454,
+      "learning_rate": 0.00034833333333333336,
+      "loss": 3.9588,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3436063597317459,
+      "grad_norm": 0.3661600649356842,
+      "learning_rate": 0.0003446296296296296,
+      "loss": 3.9532,
+      "step": 1140
+    },
+    {
+      "epoch": 0.34963454148142564,
+      "grad_norm": 0.22854666411876678,
+      "learning_rate": 0.0003409259259259259,
+      "loss": 3.9392,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3556627232311054,
+      "grad_norm": 0.2930833697319031,
+      "learning_rate": 0.0003372222222222222,
+      "loss": 3.9312,
+      "step": 1180
+    },
+    {
+      "epoch": 0.36169090498078516,
+      "grad_norm": 0.25500425696372986,
+      "learning_rate": 0.0003335185185185185,
+      "loss": 3.9295,
+      "step": 1200
+    },
+    {
+      "epoch": 0.36169090498078516,
+      "eval_loss": 3.874079793811062,
+      "eval_perplexity": 48.13838066773452,
+      "eval_runtime": 112.5895,
+      "eval_samples_per_second": 14.735,
+      "eval_steps_per_second": 0.231,
+      "step": 1200
+    },
+    {
+      "epoch": 0.36771908673046494,
+      "grad_norm": 0.27924397587776184,
+      "learning_rate": 0.00032981481481481485,
+      "loss": 3.9157,
+      "step": 1220
+    },
+    {
+      "epoch": 0.37374726848014467,
+      "grad_norm": 0.2649654448032379,
+      "learning_rate": 0.0003261111111111111,
+      "loss": 3.9136,
+      "step": 1240
+    },
+    {
+      "epoch": 0.37977545022982445,
+      "grad_norm": 0.27920904755592346,
+      "learning_rate": 0.00032240740740740743,
+      "loss": 3.899,
+      "step": 1260
+    },
+    {
+      "epoch": 0.3858036319795042,
+      "grad_norm": 0.238219752907753,
+      "learning_rate": 0.0003187037037037037,
+      "loss": 3.8965,
+      "step": 1280
+    },
+    {
+      "epoch": 0.3918318137291839,
+      "grad_norm": 0.2615567743778229,
+      "learning_rate": 0.000315,
+      "loss": 3.8877,
+      "step": 1300
+    },
+    {
+      "epoch": 0.3978599954788637,
+      "grad_norm": 0.266513466835022,
+      "learning_rate": 0.0003112962962962963,
+      "loss": 3.8806,
+      "step": 1320
+    },
+    {
+      "epoch": 0.4038881772285434,
+      "grad_norm": 0.3212108314037323,
+      "learning_rate": 0.0003075925925925926,
+      "loss": 3.8774,
+      "step": 1340
+    },
+    {
+      "epoch": 0.4099163589782232,
+      "grad_norm": 0.231920063495636,
+      "learning_rate": 0.00030388888888888886,
+      "loss": 3.8707,
+      "step": 1360
+    },
+    {
+      "epoch": 0.41594454072790293,
+      "grad_norm": 0.27022039890289307,
+      "learning_rate": 0.0003001851851851852,
+      "loss": 3.863,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4219727224775827,
+      "grad_norm": 0.243785560131073,
+      "learning_rate": 0.00029648148148148144,
+      "loss": 3.8562,
+      "step": 1400
+    },
+    {
+      "epoch": 0.42800090422726245,
+      "grad_norm": 0.2512820065021515,
+      "learning_rate": 0.0002927777777777778,
+      "loss": 3.8541,
+      "step": 1420
+    },
+    {
+      "epoch": 0.43402908597694223,
+      "grad_norm": 0.25888124108314514,
+      "learning_rate": 0.0002890740740740741,
+      "loss": 3.8421,
+      "step": 1440
+    },
+    {
+      "epoch": 0.44005726772662196,
+      "grad_norm": 0.29554685950279236,
+      "learning_rate": 0.0002853703703703704,
+      "loss": 3.8468,
+      "step": 1460
+    },
+    {
+      "epoch": 0.4460854494763017,
+      "grad_norm": 0.2608514130115509,
+      "learning_rate": 0.00028166666666666666,
+      "loss": 3.837,
+      "step": 1480
+    },
+    {
+      "epoch": 0.45211363122598147,
+      "grad_norm": 0.2781771123409271,
+      "learning_rate": 0.000277962962962963,
+      "loss": 3.8327,
+      "step": 1500
+    },
+    {
+      "epoch": 0.45211363122598147,
+      "eval_loss": 3.776173532413324,
+      "eval_perplexity": 43.6487014373908,
+      "eval_runtime": 112.7037,
+      "eval_samples_per_second": 14.72,
+      "eval_steps_per_second": 0.231,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4581418129756612,
+      "grad_norm": 0.27767959237098694,
+      "learning_rate": 0.00027425925925925925,
+      "loss": 3.8264,
+      "step": 1520
+    },
+    {
+      "epoch": 0.464169994725341,
+      "grad_norm": 0.24143943190574646,
+      "learning_rate": 0.00027055555555555557,
+      "loss": 3.8224,
+      "step": 1540
+    },
+    {
+      "epoch": 0.4701981764750207,
+      "grad_norm": 0.23281621932983398,
+      "learning_rate": 0.00026685185185185183,
+      "loss": 3.8114,
+      "step": 1560
+    },
+    {
+      "epoch": 0.4762263582247005,
+      "grad_norm": 0.23552870750427246,
+      "learning_rate": 0.00026314814814814815,
+      "loss": 3.8086,
+      "step": 1580
+    },
+    {
+      "epoch": 0.4822545399743802,
+      "grad_norm": 0.25271207094192505,
+      "learning_rate": 0.0002594444444444444,
+      "loss": 3.7954,
+      "step": 1600
+    },
+    {
+      "epoch": 0.48828272172405995,
+      "grad_norm": 0.2609969675540924,
+      "learning_rate": 0.0002557407407407408,
+      "loss": 3.8052,
+      "step": 1620
+    },
+    {
+      "epoch": 0.49431090347373974,
+      "grad_norm": 0.290229469537735,
+      "learning_rate": 0.00025203703703703705,
+      "loss": 3.7946,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5003390852234195,
+      "grad_norm": 0.2184811681509018,
+      "learning_rate": 0.0002483333333333333,
+      "loss": 3.7913,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5063672669730992,
+      "grad_norm": 0.24770255386829376,
+      "learning_rate": 0.00024462962962962963,
+      "loss": 3.7902,
+      "step": 1680
+    },
+    {
+      "epoch": 0.512395448722779,
+      "grad_norm": 0.2306816428899765,
+      "learning_rate": 0.00024092592592592593,
+      "loss": 3.7775,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5184236304724588,
+      "grad_norm": 0.2789316475391388,
+      "learning_rate": 0.00023722222222222222,
+      "loss": 3.7774,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5244518122221385,
+      "grad_norm": 0.2576800286769867,
+      "learning_rate": 0.0002335185185185185,
+      "loss": 3.7773,
+      "step": 1740
+    },
+    {
+      "epoch": 0.5304799939718182,
+      "grad_norm": 0.2184872031211853,
+      "learning_rate": 0.0002298148148148148,
+      "loss": 3.7739,
+      "step": 1760
+    },
+    {
+      "epoch": 0.536508175721498,
+      "grad_norm": 0.27034640312194824,
+      "learning_rate": 0.00022611111111111112,
+      "loss": 3.7632,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5425363574711778,
+      "grad_norm": 0.2661360800266266,
+      "learning_rate": 0.0002224074074074074,
+      "loss": 3.7668,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5425363574711778,
+      "eval_loss": 3.7104278931619175,
+      "eval_perplexity": 40.871291331988935,
+      "eval_runtime": 112.585,
+      "eval_samples_per_second": 14.736,
+      "eval_steps_per_second": 0.231,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5485645392208575,
+      "grad_norm": 0.2934229373931885,
+      "learning_rate": 0.0002187037037037037,
+      "loss": 3.7645,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5545927209705372,
+      "grad_norm": 0.24501343071460724,
+      "learning_rate": 0.000215,
+      "loss": 3.7559,
+      "step": 1840
+    },
+    {
+      "epoch": 0.560620902720217,
+      "grad_norm": 0.21381346881389618,
+      "learning_rate": 0.00021129629629629629,
+      "loss": 3.7482,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5666490844698968,
+      "grad_norm": 0.2350403219461441,
+      "learning_rate": 0.0002075925925925926,
+      "loss": 3.7488,
+      "step": 1880
+    },
+    {
+      "epoch": 0.5726772662195765,
+      "grad_norm": 0.23535749316215515,
+      "learning_rate": 0.0002038888888888889,
+      "loss": 3.7409,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5787054479692563,
+      "grad_norm": 0.22763009369373322,
+      "learning_rate": 0.0002001851851851852,
+      "loss": 3.7431,
+      "step": 1920
+    },
+    {
+      "epoch": 0.584733629718936,
+      "grad_norm": 0.22279077768325806,
+      "learning_rate": 0.00019648148148148148,
+      "loss": 3.7373,
+      "step": 1940
+    },
+    {
+      "epoch": 0.5907618114686157,
+      "grad_norm": 0.19868987798690796,
+      "learning_rate": 0.00019277777777777777,
+      "loss": 3.7385,
+      "step": 1960
+    },
+    {
+      "epoch": 0.5967899932182955,
+      "grad_norm": 0.22577470541000366,
+      "learning_rate": 0.00018907407407407406,
+      "loss": 3.7266,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6028181749679753,
+      "grad_norm": 0.23861481249332428,
+      "learning_rate": 0.00018537037037037038,
+      "loss": 3.7288,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6088463567176551,
+      "grad_norm": 0.2368811070919037,
+      "learning_rate": 0.00018166666666666667,
+      "loss": 3.7251,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6148745384673348,
+      "grad_norm": 0.23746469616889954,
+      "learning_rate": 0.00017796296296296296,
+      "loss": 3.7194,
+      "step": 2040
+    },
+    {
+      "epoch": 0.6209027202170145,
+      "grad_norm": 0.19934986531734467,
+      "learning_rate": 0.00017425925925925926,
+      "loss": 3.7203,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6269309019666943,
+      "grad_norm": 0.22287386655807495,
+      "learning_rate": 0.00017055555555555555,
+      "loss": 3.7182,
+      "step": 2080
+    },
+    {
+      "epoch": 0.632959083716374,
+      "grad_norm": 0.21625646948814392,
+      "learning_rate": 0.00016685185185185187,
+      "loss": 3.7133,
+      "step": 2100
+    },
+    {
+      "epoch": 0.632959083716374,
+      "eval_loss": 3.661569493497066,
+      "eval_perplexity": 38.922383385070816,
+      "eval_runtime": 112.8832,
+      "eval_samples_per_second": 14.697,
+      "eval_steps_per_second": 0.23,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6389872654660538,
+      "grad_norm": 0.2128862887620926,
+      "learning_rate": 0.00016314814814814816,
+      "loss": 3.7123,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6450154472157336,
+      "grad_norm": 0.22579094767570496,
+      "learning_rate": 0.00015944444444444445,
+      "loss": 3.7103,
+      "step": 2140
+    },
+    {
+      "epoch": 0.6510436289654133,
+      "grad_norm": 0.19634632766246796,
+      "learning_rate": 0.00015574074074074074,
+      "loss": 3.7077,
+      "step": 2160
+    },
+    {
+      "epoch": 0.657071810715093,
+      "grad_norm": 0.21508827805519104,
+      "learning_rate": 0.00015203703703703703,
+      "loss": 3.7071,
+      "step": 2180
+    },
+    {
+      "epoch": 0.6630999924647728,
+      "grad_norm": 0.23243261873722076,
+      "learning_rate": 0.00014833333333333335,
+      "loss": 3.7035,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6691281742144526,
+      "grad_norm": 0.22189666330814362,
+      "learning_rate": 0.00014462962962962964,
+      "loss": 3.7011,
+      "step": 2220
+    },
+    {
+      "epoch": 0.6751563559641324,
+      "grad_norm": 0.19845160841941833,
+      "learning_rate": 0.00014092592592592594,
+      "loss": 3.6996,
+      "step": 2240
+    },
+    {
+      "epoch": 0.681184537713812,
+      "grad_norm": 0.1785988211631775,
+      "learning_rate": 0.00013722222222222223,
+      "loss": 3.6866,
+      "step": 2260
+    },
+    {
+      "epoch": 0.6872127194634918,
+      "grad_norm": 0.2001648247241974,
+      "learning_rate": 0.00013351851851851852,
+      "loss": 3.6939,
+      "step": 2280
+    },
+    {
+      "epoch": 0.6932409012131716,
+      "grad_norm": 0.21330291032791138,
+      "learning_rate": 0.00012981481481481484,
+      "loss": 3.6868,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6992690829628513,
+      "grad_norm": 0.23209834098815918,
+      "learning_rate": 0.00012611111111111113,
+      "loss": 3.6803,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7052972647125311,
+      "grad_norm": 0.18792849779129028,
+      "learning_rate": 0.00012240740740740742,
+      "loss": 3.6824,
+      "step": 2340
+    },
+    {
+      "epoch": 0.7113254464622109,
+      "grad_norm": 0.19127227365970612,
+      "learning_rate": 0.0001187037037037037,
+      "loss": 3.6821,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7173536282118906,
+      "grad_norm": 0.21657773852348328,
+      "learning_rate": 0.000115,
+      "loss": 3.6727,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7233818099615703,
+      "grad_norm": 0.1759604662656784,
+      "learning_rate": 0.0001112962962962963,
+      "loss": 3.6809,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7233818099615703,
+      "eval_loss": 3.62554258938625,
+      "eval_perplexity": 37.54508920080861,
+      "eval_runtime": 112.4817,
+      "eval_samples_per_second": 14.749,
+      "eval_steps_per_second": 0.231,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7294099917112501,
+      "grad_norm": 0.20114636421203613,
+      "learning_rate": 0.00010759259259259259,
+      "loss": 3.6772,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7354381734609299,
+      "grad_norm": 0.20904237031936646,
+      "learning_rate": 0.00010388888888888889,
+      "loss": 3.6754,
+      "step": 2440
+    },
+    {
+      "epoch": 0.7414663552106096,
+      "grad_norm": 0.18807630240917206,
+      "learning_rate": 0.00010018518518518518,
+      "loss": 3.6756,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7474945369602893,
+      "grad_norm": 0.17234675586223602,
+      "learning_rate": 9.648148148148149e-05,
+      "loss": 3.6686,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7535227187099691,
+      "grad_norm": 0.18095088005065918,
+      "learning_rate": 9.277777777777778e-05,
+      "loss": 3.6691,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7595509004596489,
+      "grad_norm": 0.19775047898292542,
+      "learning_rate": 8.907407407407407e-05,
+      "loss": 3.6679,
+      "step": 2520
+    },
+    {
+      "epoch": 0.7655790822093286,
+      "grad_norm": 0.1963493674993515,
+      "learning_rate": 8.537037037037038e-05,
+      "loss": 3.668,
+      "step": 2540
+    },
+    {
+      "epoch": 0.7716072639590084,
+      "grad_norm": 0.16776752471923828,
+      "learning_rate": 8.166666666666667e-05,
+      "loss": 3.663,
+      "step": 2560
+    },
+    {
+      "epoch": 0.7776354457086881,
+      "grad_norm": 0.18634021282196045,
+      "learning_rate": 7.796296296296296e-05,
+      "loss": 3.6616,
+      "step": 2580
+    },
+    {
+      "epoch": 0.7836636274583678,
+      "grad_norm": 0.1853896975517273,
+      "learning_rate": 7.425925925925927e-05,
+      "loss": 3.6554,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7896918092080476,
+      "grad_norm": 0.19131463766098022,
+      "learning_rate": 7.055555555555556e-05,
+      "loss": 3.6617,
+      "step": 2620
+    },
+    {
+      "epoch": 0.7957199909577274,
+      "grad_norm": 0.1861460953950882,
+      "learning_rate": 6.685185185185186e-05,
+      "loss": 3.6589,
+      "step": 2640
+    },
+    {
+      "epoch": 0.8017481727074072,
+      "grad_norm": 0.18328429758548737,
+      "learning_rate": 6.314814814814815e-05,
+      "loss": 3.6581,
+      "step": 2660
+    },
+    {
+      "epoch": 0.8077763544570868,
+      "grad_norm": 0.1829613447189331,
+      "learning_rate": 5.9444444444444445e-05,
+      "loss": 3.6508,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8138045362067666,
+      "grad_norm": 0.17789465188980103,
+      "learning_rate": 5.5740740740740744e-05,
+      "loss": 3.6536,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8138045362067666,
+      "eval_loss": 3.600645640400387,
+      "eval_perplexity": 36.62187137207453,
+      "eval_runtime": 112.2753,
+      "eval_samples_per_second": 14.776,
+      "eval_steps_per_second": 0.232,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8198327179564464,
+      "grad_norm": 0.16772860288619995,
+      "learning_rate": 5.2037037037037035e-05,
+      "loss": 3.6515,
+      "step": 2720
+    },
+    {
+      "epoch": 0.8258608997061262,
+      "grad_norm": 0.16776631772518158,
+      "learning_rate": 4.8333333333333334e-05,
+      "loss": 3.6539,
+      "step": 2740
+    },
+    {
+      "epoch": 0.8318890814558059,
+      "grad_norm": 0.1612899899482727,
+      "learning_rate": 4.462962962962963e-05,
+      "loss": 3.6494,
+      "step": 2760
+    },
+    {
+      "epoch": 0.8379172632054857,
+      "grad_norm": 0.16370627284049988,
+      "learning_rate": 4.092592592592593e-05,
+      "loss": 3.648,
+      "step": 2780
+    },
+    {
+      "epoch": 0.8439454449551654,
+      "grad_norm": 0.1611148715019226,
+      "learning_rate": 3.722222222222222e-05,
+      "loss": 3.6498,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8499736267048451,
+      "grad_norm": 0.16145454347133636,
+      "learning_rate": 3.351851851851852e-05,
+      "loss": 3.6518,
+      "step": 2820
+    },
+    {
+      "epoch": 0.8560018084545249,
+      "grad_norm": 0.1517009735107422,
+      "learning_rate": 2.9814814814814815e-05,
+      "loss": 3.644,
+      "step": 2840
+    },
+    {
+      "epoch": 0.8620299902042047,
+      "grad_norm": 0.1570337861776352,
+      "learning_rate": 2.6111111111111114e-05,
+      "loss": 3.6417,
+      "step": 2860
+    },
+    {
+      "epoch": 0.8680581719538845,
+      "grad_norm": 0.16112424433231354,
+      "learning_rate": 2.240740740740741e-05,
+      "loss": 3.6459,
+      "step": 2880
+    },
+    {
+      "epoch": 0.8740863537035641,
+      "grad_norm": 0.15278516709804535,
+      "learning_rate": 1.8703703703703707e-05,
+      "loss": 3.6408,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8801145354532439,
+      "grad_norm": 0.15267758071422577,
+      "learning_rate": 1.5e-05,
+      "loss": 3.6452,
+      "step": 2920
+    },
+    {
+      "epoch": 0.8861427172029237,
+      "grad_norm": 0.14383205771446228,
+      "learning_rate": 1.1296296296296295e-05,
+      "loss": 3.6364,
+      "step": 2940
+    },
+    {
+      "epoch": 0.8921708989526034,
+      "grad_norm": 0.14794571697711945,
+      "learning_rate": 7.592592592592593e-06,
+      "loss": 3.6325,
+      "step": 2960
+    },
+    {
+      "epoch": 0.8981990807022832,
+      "grad_norm": 0.14148849248886108,
+      "learning_rate": 3.888888888888889e-06,
+      "loss": 3.64,
+      "step": 2980
+    },
+    {
+      "epoch": 0.9042272624519629,
+      "grad_norm": 0.1426716446876526,
+      "learning_rate": 1.8518518518518518e-07,
+      "loss": 3.6443,
+      "step": 3000
+    },
+    {
+      "epoch": 0.9042272624519629,
+      "eval_loss": 3.587533058962724,
+      "eval_perplexity": 36.1447987636509,
+      "eval_runtime": 112.2843,
+      "eval_samples_per_second": 14.775,
+      "eval_steps_per_second": 0.232,
+      "step": 3000
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 3000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 300,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.604552000274432e+18,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-3000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08d76f8b5b85f05fb7caa15e408143d3ba0686e42d70879850259702897d08d2
+size 5905

checkpoint-3000/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

full_config.json ADDED Viewed

	@@ -0,0 +1,73 @@

+{
+  "shorthand": "model.768.lyr.12 - seqlen.1024 - mla.0.0.0 - ah12.64 - rd32.32",
+  "notes": "GPT-2 run with essentially MHA.",
+  "model": {
+    "hidden_size": 768,
+    "num_hidden_layers": 12,
+    "intermediate_size": 2048,
+    "vocab_size": 50257,
+    "tie_word_embeddings": true,
+    "max_position_embeddings": 1024,
+    "norm_type": "rmsnorm",
+    "layer_norm_eps": 1e-12,
+    "rms_norm_eps": 1e-06,
+    "num_dense_layers": 0,
+    "num_attention_heads": 12,
+    "q_shared_dim": null,
+    "kv_shared_dim": null,
+    "o_shared_dim": null,
+    "qk_private_dim": 64,
+    "vo_private_dim": 64,
+    "rope_dims": 32,
+    "nope_dims": 32,
+    "rope_theta": 10000.0,
+    "rope_scaling": {
+      "type": "linear",
+      "factor": 2.0
+    },
+    "attention_bias": false,
+    "attention_backend": "flash_attention_2",
+    "ffn_decompose": false,
+    "ffn_rank": null,
+    "vocab_subspace": false,
+    "vocab_rank": null,
+    "hidden_dropout_prob": 0.1,
+    "attention_dropout_prob": 0.1,
+    "classifier_dropout": null,
+    "initializer_range": 0.02
+  },
+  "pre_train": {
+    "wandb_project": "decoder-pretrain-c4",
+    "output_dir": "checkpoints/gpt-2_seq1024_mla0-0-0",
+    "seed": 42,
+    "logging_steps": 20,
+    "save_steps": 300,
+    "train_batch_size": 64,
+    "gradient_accumulation_steps": 16,
+    "learning_rate": 0.0005,
+    "num_train_steps": 3000,
+    "eval_steps": 300,
+    "weight_decay": 0.01,
+    "num_workers": 8,
+    "pin_memory": true,
+    "_comment_dataset": "Use preprocessed_dataset_path instead of dataset streaming",
+    "preprocessed_dataset_path": "/home/ubuntu/c4_en_pct0.02_seq1024/c4_en_pct0.02_seq1024/dataset",
+    "dataset_name": "allenai/c4",
+    "dataset_config": "en",
+    "dataset_subset_pct": 0.02,
+    "max_seq_length": 1024,
+    "eval_batch_size": 64,
+    "fp16": false,
+    "bf16": true,
+    "torch_compile": true,
+    "torch_compile_backend": "inductor",
+    "torch_compile_mode": "default",
+    "run_name": "117.88M - model.768.lyr.12 - seqlen.1024 - mla.0.0.0 - ah12.64 - rd32.32",
+    "run_id": "15ixey47",
+    "run_url": "https://wandb.ai/chrismccormick/decoder-pretrain-c4/runs/15ixey47",
+    "best_checkpoint": "checkpoints/gpt-2_seq1024_mla0-0-0/checkpoint-3000"
+  },
+  "stats": {
+    "total_elements": "117.88M"
+  }
+}