Upload folder using huggingface_hub

Browse files

Files changed (11) hide show

.gitattributes +1 -0
config.json +56 -0
configuration_dragon.py +197 -0
generation_config.json +8 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +627 -0
modeling_dragon.py +1280 -0
special_tokens_map.json +5 -0
tokenizer.json +3 -0
tokenizer_config.json +219 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

config.json ADDED Viewed

	@@ -0,0 +1,56 @@

+{
+  "A_init_range": [
+    1,
+    16
+  ],
+  "architectures": [
+    "DragonForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_dragon.DragonConfig",
+    "AutoModel": "modeling_dragon.DragonModel",
+    "AutoModelForCausalLM": "modeling_dragon.DragonForCausalLM"
+  },
+  "bos_token_id": 0,
+  "conv_kernel": 4,
+  "dtype": "bfloat16",
+  "eos_token_id": 0,
+  "eot_token_id": 0,
+  "expand_factor": 2,
+  "hidden_dropout": 0.0,
+  "hidden_size": 2048,
+  "initializer_range": 0.006,
+  "intermediate_size": 8192,
+  "layers_config": "lrlrdlrlrlrlrdlrlrlrlrdlrlrlrlrdlrlr",
+  "max_position_embeddings": 32768,
+  "mlp_bias": false,
+  "mlp_hidden_act": "relu2",
+  "model_type": "dragon",
+  "norm_epsilon": 1e-06,
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 16,
+  "num_logits_to_keep": 1,
+  "old_lns": true,
+  "pad_token_id": 0,
+  "qk_norm": true,
+  "residual_in_fp32": false,
+  "rope_theta": 163,
+  "scalable_softmax": true,
+  "sliding_window_size": 1024,
+  "slw_wsize": 32768,
+  "softcap_global_attn": 150.0,
+  "softcap_local_attn": 0.0,
+  "tie_word_embeddings": false,
+  "time_step_floor": 0.0001,
+  "time_step_max": 0.1,
+  "time_step_min": 0.001,
+  "transformers_version": "4.56.1",
+  "uscaling_tau": 0.2,
+  "use_bias": false,
+  "use_cache": true,
+  "use_uscaling": false,
+  "vocab_size": 196736
+}

configuration_dragon.py ADDED Viewed

	@@ -0,0 +1,197 @@

+# coding=utf-8
+"""Dragon model configuration"""
+import re
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+#@register_for_auto_class("AutoConfig")
+class DragonConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DragonModel`]. It is used to instantiate a
+    Dragon model according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Several constructor arguments are stored under a different attribute name (and therefore appear under that
+    name in a serialized `config.json`): `rope_theta_local` -> `rope_theta`, `gdn_d_conv` -> `conv_kernel`,
+    `gdn_dt_min` -> `time_step_min`, `gdn_dt_max` -> `time_step_max`, `gdn_dt_init_floor` -> `time_step_floor`,
+    `gdn_A_init_range` -> `A_init_range`. In addition, `qk_norm` and `scalable_softmax` are not constructor
+    arguments; this constructor always sets both to `True`, and `num_hidden_layers` is derived as
+    `len(layers_config)`.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the Dragon model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`DragonModel`]
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
+            model has a output word embedding layer.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with.
+        use_uscaling (`bool`, *optional*, defaults to `True`):
+            Stored as `use_uscaling`. Its effect is defined by the modeling code, which is not part of this file
+            (TODO: document once confirmed against `modeling_dragon.py`).
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations. Must be divisible by `num_attention_heads`.
+        intermediate_size (`int`, *optional*, defaults to 8192):
+            Dimension of the MLP representations.
+        expand_factor (`int`, *optional*, defaults to 2):
+            Stored as `expand_factor`. Presumably an expansion factor for a mixer's inner dimension
+            (TODO: confirm against the modeling code).
+        layers_config (`str`, *optional*, defaults to `4 * "lrdlr"`):
+            String with one character per layer; `num_hidden_layers` is set to its length. The meaning of each
+            character is defined by the modeling code, not here (TODO: document the letter codes).
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer. Must be even and divisible by
+            `num_key_value_heads`.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
+            If `None`, falls back to `num_attention_heads`. Note that this class asserts the value is even, so
+            `num_key_value_heads=1` is rejected.
+        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
+            The non-linear activation function in the MLP layers.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in attention layers.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in MLP layers.
+        use_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the model.
+        initializer_range (`float`, *optional*, defaults to 0.006):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        softcap_local_attn (`float`, *optional*, defaults to 0.0):
+            Stored as `softcap_local_attn`. Presumably the logit soft-capping value for local (sliding-window)
+            attention (TODO: confirm, including whether 0.0 disables it).
+        softcap_global_attn (`float`, *optional*, defaults to 150.0):
+            Stored as `softcap_global_attn`. Presumably the logit soft-capping value for global attention
+            (TODO: confirm against the modeling code).
+        norm_epsilon (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the layer normalization layers.
+        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
+            Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype` as the rest of the model.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
+            integer value, only last `num_logits_to_keep` logits will be calculated.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        sliding_window_size (`int`, *optional*, defaults to 1024):
+            Sliding window attention window size.
+        slw_wsize (`int`, *optional*, defaults to -1):
+            Stored as `slw_wsize`. Semantics are defined by the modeling code
+            (TODO: document, including the meaning of -1).
+        rope_theta_local (`float`, *optional*, defaults to 163.0):
+            Stored as `rope_theta`. Presumably the rotary position embedding base (TODO: confirm).
+        uscaling_tau (`float`, *optional*, defaults to 0.2):
+            Stored as `uscaling_tau`. Presumably a hyper-parameter of the scheme toggled by `use_uscaling`
+            (TODO: confirm).
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the hidden states.
+        gdn_d_conv (`int`, *optional*, defaults to 4):
+            Stored as `conv_kernel`. Presumably the convolution kernel size used by the linear-attention layers
+            (TODO: confirm).
+        gdn_dt_min (`float`, *optional*, defaults to 0.001):
+            Stored as `time_step_min`. Presumably the minimum value for the time step (TODO: confirm).
+        gdn_dt_max (`float`, *optional*, defaults to 0.1):
+            Stored as `time_step_max`. Presumably the maximum value for the time step (TODO: confirm).
+        gdn_dt_init_floor (`float`, *optional*, defaults to 1e-4):
+            Stored as `time_step_floor`. Presumably the floor value for time step initialization (TODO: confirm).
+        gdn_A_init_range (`tuple`, *optional*, defaults to `(1, 16)`):
+            Stored as `A_init_range`. Presumably the initialization range for the `A` parameter of the
+            linear-attention layers (TODO: confirm).
+        old_lns (`bool`, *optional*, defaults to `False`):
+            Stored as `old_lns`. Semantics are defined by the modeling code (TODO: document).
+        **kwargs:
+            Forwarded unchanged to [`PretrainedConfig`].
+    """
+    model_type = "dragon"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size=151936,
+        tie_word_embeddings=False,
+        max_position_embeddings=8192,
+        use_uscaling=True,
+        hidden_size=2048,
+        intermediate_size=8192,
+        expand_factor=2,
+        layers_config=4*"lrdlr",
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        mlp_hidden_act="relu2",
+        attention_bias=False,
+        mlp_bias=False,
+        use_bias=False,
+        initializer_range=0.006,
+        softcap_local_attn=0.0,
+        softcap_global_attn=150.0,
+        norm_epsilon=1e-6,
+        residual_in_fp32=False,
+        use_cache=True,
+        num_logits_to_keep=1,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        sliding_window_size=1024,
+        slw_wsize=-1,
+        rope_theta_local=163.,
+        uscaling_tau=0.2,
+        attention_dropout=0.,
+        hidden_dropout=0.,
+        gdn_d_conv=4,
+        gdn_dt_min=0.001,
+        gdn_dt_max=0.1,
+        gdn_dt_init_floor=1e-4,
+        gdn_A_init_range=(1, 16),
+        old_lns=False,
+        **kwargs,
+    ):
+        # NOTE: the constructor argument is `rope_theta_local` but the attribute is `rope_theta`.
+        self.rope_theta = rope_theta_local
+        # Not a constructor argument: always set to True here.
+        self.qk_norm = True
+        self.softcap_local_attn=softcap_local_attn
+        self.softcap_global_attn=softcap_global_attn
+        self.use_uscaling = use_uscaling
+        self.uscaling_tau = uscaling_tau
+        # Not a constructor argument: always set to True here.
+        self.scalable_softmax = True
+        self.vocab_size = vocab_size
+        self.tie_word_embeddings = tie_word_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.expand_factor = expand_factor
+        self.layers_config = layers_config
+        # One layer per character of `layers_config`.
+        self.num_hidden_layers = len(layers_config)
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window_size = sliding_window_size
+        self.slw_wsize = slw_wsize
+        self.attention_dropout = attention_dropout
+        self.hidden_dropout = hidden_dropout
+        self.max_position_embeddings = max_position_embeddings
+        # A missing KV-head count falls back to one KV head per attention head.
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.mlp_hidden_act = mlp_hidden_act
+        self.attention_bias = attention_bias
+        self.mlp_bias = mlp_bias
+        self.use_bias = use_bias
+        self.initializer_range = initializer_range
+        self.norm_epsilon = norm_epsilon
+        self.residual_in_fp32 = residual_in_fp32
+        self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
+        # The `gdn_*` arguments are stored under different attribute names.
+        self.conv_kernel = gdn_d_conv
+        self.time_step_min = gdn_dt_min
+        self.time_step_max = gdn_dt_max
+        self.time_step_floor = gdn_dt_init_floor
+        self.A_init_range = gdn_A_init_range
+        self.old_lns = old_lns
+        # Shape sanity checks on the head configuration.
+        # NOTE(review): these are `assert`s, so they are skipped when Python runs with -O.
+        assert self.hidden_size % self.num_attention_heads == 0
+        assert self.num_attention_heads % self.num_key_value_heads == 0
+        assert self.num_attention_heads % 2 == 0, "Number of attention heads must be even for differential attention."
+        assert self.num_key_value_heads % 2 == 0, "Number of kv heads must be even for differential attention."
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        # TODO: better way to handle those?
+        # Copy any existing `auto_map` and fill in the Dragon entries only where missing
+        # (`setdefault` keeps values that are already present).
+        self.auto_map = dict(getattr(self, "auto_map", {}))
+        self.auto_map.setdefault("AutoConfig", "configuration_dragon.DragonConfig")
+        self.auto_map.setdefault("AutoModel", "modeling_dragon.DragonModel")
+        self.auto_map.setdefault("AutoModelForCausalLM", "modeling_dragon.DragonForCausalLM")
+# Register the config class under the "AutoConfig" auto class at import time.
+DragonConfig.register_for_auto_class("AutoConfig")
+# Public API of this module.
+__all__ = ["DragonConfig"]
+# todo : update docstrings

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "eot_token_id": 0,
+  "pad_token_id": 0,
+  "transformers_version": "4.56.1"
+}

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:26dc76a87683ef4caf3539a44127b7f2545a4dd900827e42d7ea490fc960a44a
+size 4992581544

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5750f90d15cba4f16a0f73c7ef1e827c50f1266dcb990d1227653efcf123a30f
+size 2403217768

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,627 @@

+{
+  "metadata": {
+    "total_parameters": 3697865088,
+    "total_size": 7395730176
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00002-of-00002.safetensors",
+    "model.embedding.weight": "model-00001-of-00002.safetensors",
+    "model.final_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.0.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.0.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.0.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.1.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.1.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.10.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.10.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.11.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.11.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.12.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.12.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn.lambda_k1": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn.lambda_k2": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn.lambda_q1": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn.lambda_q2": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn.softmax_scaler": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.13.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.13.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.14.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.14.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.15.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.15.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.16.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.16.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.17.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.17.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.18.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.18.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.19.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.19.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.2.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.2.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.20.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.20.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.21.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.21.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.lambda_k1": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.lambda_k2": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.lambda_q1": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.lambda_q2": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.softmax_scaler": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.22.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.22.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.23.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.23.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.24.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.24.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.25.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.25.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.attn.linear_qkv.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.input_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.lin_attn.A_log": "model-00002-of-00002.safetensors",
+    "model.layers.26.lin_attn.dt_bias": "model-00002-of-00002.safetensors",
+    "model.layers.26.lin_attn.g_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.lin_attn.in_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.lin_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.lin_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.lin_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.lin_attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mixer_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.fc_1.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.fc_2.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.postmixer_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.attn.linear_qkv.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.input_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.lin_attn.A_log": "model-00002-of-00002.safetensors",
+    "model.layers.27.lin_attn.dt_bias": "model-00002-of-00002.safetensors",
+    "model.layers.27.lin_attn.g_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.lin_attn.in_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.lin_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.lin_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.lin_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.lin_attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mixer_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.fc_1.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.fc_2.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.postmixer_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.attn.linear_qkv.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.input_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.lin_attn.A_log": "model-00002-of-00002.safetensors",
+    "model.layers.28.lin_attn.dt_bias": "model-00002-of-00002.safetensors",
+    "model.layers.28.lin_attn.g_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.lin_attn.in_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.lin_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.lin_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.lin_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.lin_attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mixer_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.fc_1.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.fc_2.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.postmixer_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.attn.linear_qkv.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.input_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.lin_attn.A_log": "model-00002-of-00002.safetensors",
+    "model.layers.29.lin_attn.dt_bias": "model-00002-of-00002.safetensors",
+    "model.layers.29.lin_attn.g_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.lin_attn.in_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.lin_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.lin_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.lin_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.lin_attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mixer_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.fc_1.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.fc_2.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.postmixer_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.3.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.3.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.3.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.attn.linear_qkv.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.input_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.lin_attn.A_log": "model-00002-of-00002.safetensors",
+    "model.layers.30.lin_attn.dt_bias": "model-00002-of-00002.safetensors",
+    "model.layers.30.lin_attn.g_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.lin_attn.in_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.lin_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.lin_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.lin_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.lin_attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mixer_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.fc_1.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.fc_2.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.postmixer_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.attn.lambda_k1": "model-00002-of-00002.safetensors",
+    "model.layers.31.attn.lambda_k2": "model-00002-of-00002.safetensors",
+    "model.layers.31.attn.lambda_q1": "model-00002-of-00002.safetensors",
+    "model.layers.31.attn.lambda_q2": "model-00002-of-00002.safetensors",
+    "model.layers.31.attn.linear_qkv.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.attn.softmax_scaler": "model-00002-of-00002.safetensors",
+    "model.layers.31.attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.input_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.lin_attn.A_log": "model-00002-of-00002.safetensors",
+    "model.layers.31.lin_attn.dt_bias": "model-00002-of-00002.safetensors",
+    "model.layers.31.lin_attn.g_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.lin_attn.in_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.lin_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.lin_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.lin_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.lin_attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mixer_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.fc_1.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.fc_2.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.postmixer_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.attn.linear_qkv.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.input_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.lin_attn.A_log": "model-00002-of-00002.safetensors",
+    "model.layers.32.lin_attn.dt_bias": "model-00002-of-00002.safetensors",
+    "model.layers.32.lin_attn.g_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.lin_attn.in_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.lin_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.lin_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.lin_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.lin_attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.mixer_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.mlp.fc_1.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.mlp.fc_2.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.postmixer_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.attn.linear_qkv.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.input_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.lin_attn.A_log": "model-00002-of-00002.safetensors",
+    "model.layers.33.lin_attn.dt_bias": "model-00002-of-00002.safetensors",
+    "model.layers.33.lin_attn.g_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.lin_attn.in_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.lin_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.lin_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.lin_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.lin_attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.mixer_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.mlp.fc_1.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.mlp.fc_2.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.postmixer_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.attn.linear_qkv.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.input_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.lin_attn.A_log": "model-00002-of-00002.safetensors",
+    "model.layers.34.lin_attn.dt_bias": "model-00002-of-00002.safetensors",
+    "model.layers.34.lin_attn.g_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.lin_attn.in_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.lin_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.lin_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.lin_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.lin_attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.mixer_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.mlp.fc_1.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.mlp.fc_2.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.postmixer_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.attn.linear_qkv.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.input_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.lin_attn.A_log": "model-00002-of-00002.safetensors",
+    "model.layers.35.lin_attn.dt_bias": "model-00002-of-00002.safetensors",
+    "model.layers.35.lin_attn.g_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.lin_attn.in_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.lin_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.lin_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.lin_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.lin_attn_group_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.mixer_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.mlp.fc_1.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.mlp.fc_2.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.postmixer_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.4.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.lambda_k1": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.lambda_k2": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.lambda_q1": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.lambda_q2": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.softmax_scaler": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.4.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.4.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.5.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.5.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.6.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.6.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.7.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.7.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.8.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.8.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.postmixer_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.attn.linear_qkv.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.input_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.lin_attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.9.lin_attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.9.lin_attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.lin_attn.in_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.lin_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.lin_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.lin_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.lin_attn_group_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mixer_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.fc_1.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.fc_2.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.postmixer_norm.weight": "model-00001-of-00002.safetensors"
+  }
+}

modeling_dragon.py ADDED Viewed

	@@ -0,0 +1,1280 @@

+# coding=utf-8
+"""PyTorch Dragon model."""
+from typing import Any, Dict, Optional, Tuple, Union
+from dataclasses import dataclass
+import inspect
+import math
+from einops import rearrange
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers.modeling_utils import PreTrainedModel
+from transformers.modeling_layers import GradientCheckpointingLayer
+from transformers.cache_utils import DynamicCache
+from transformers.generation import GenerationMixin
+from transformers.utils import ModelOutput, logging
+from .configuration_dragon import DragonConfig
+logger = logging.get_logger(__name__)
+# Attention backend, resolved once at import time: FlashAttention-2 (`flash_attn`) is tried first, then
+# FlashAttention-3 (`flash_attn_interface`); "eager" (pure PyTorch) is kept when neither can be used.
+ATTN_IMPL = "eager"
+try:
+    from flash_attn import flash_attn_func # FA2
+    ATTN_IMPL = "fa2"
+except ImportError:
+    try:
+        import flash_attn_interface # FA3
+        flash_attn_func = flash_attn_interface.flash_attn_func
+        # the FA3 entry point must accept `window_size`; the ImportError raised otherwise is caught by the
+        # handler just below, so ATTN_IMPL stays "eager" in that case
+        _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
+        if not _flash_supports_window_size:
+            raise ImportError("flash_attn_func does not support window_size parameter. Please update to more recent flash_attn version")
+        ATTN_IMPL = "fa3"
+    except ImportError:
+        logger.warning_once(
+            "Flash attention is not installed, using eager attention implementation. "
+            "For better performance, consider installing flash_attn."
+        )
+print(f"Using attention implementation: {ATTN_IMPL}")
+# Backend for differential attention: `flex_head_fa` when importable, otherwise whatever was selected above.
+DIFF_ATTN_IMPL = None
+try:
+    import flex_head_fa
+    DIFF_ATTN_IMPL = "flex_head"
+except ImportError:
+    DIFF_ATTN_IMPL = ATTN_IMPL # if we don't have flex_head_fa, fallback to the best attention impl we have
+print(f"Using differential attention implementation: {DIFF_ATTN_IMPL}")
+# Gated DeltaNet
+# (both kernel names are set to None when flash-linear-attention is missing)
+try:
+    from fla.ops.gated_delta_rule import chunk_gated_delta_rule, fused_recurrent_gated_delta_rule
+except ImportError:
+    logger.warning_once("Falling back to Torch implementation for Gated DeltaNet as flash-linear-attention module was not found.")
+    chunk_gated_delta_rule, fused_recurrent_gated_delta_rule = None, None
+# 1D short convolution
+# (both kernel names are set to None when causal-conv1d is missing; DragonConv1D checks for None before using them)
+try:
+    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+except ImportError:
+    logger.warning_once("Falling back to Torch implementation for the short convolution as causal-conv1d module was not found.")
+    causal_conv1d_fn, causal_conv1d_update = None, None
+class DragonHeadWiseRMSNorm(nn.Module):
+    def __init__(self, n_heads, d_head, eps=1e-6):
+        super().__init__()
+        self.rms = nn.RMSNorm(d_head, eps=eps, elementwise_affine=False)
+        self.weight = nn.Parameter(torch.ones(n_heads, d_head))
+    def forward(self, hidden_states):
+        B, L, H, D = hidden_states.shape
+        y = self.rms(hidden_states) * self.weight.view(1, 1, H, D)
+        return y.view(B, L, H, D)
+class DragonRMSNorm(nn.RMSNorm):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        DragonRMSNorm is equivalent to RMSNorm
+        """
+        super().__init__(normalized_shape=hidden_size, eps=eps)
+class _ScaleFB(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, alpha_fwd: torch.Tensor, alpha_bwd: torch.Tensor):
+        ctx.save_for_backward(alpha_bwd)
+        return x * alpha_fwd
+    @staticmethod
+    def backward(ctx, grad_output):
+        (alpha_bwd,) = ctx.saved_tensors
+        return grad_output * alpha_bwd, None, None
+class _ScaledLinearFB(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, weight, bias, alpha_fwd, alpha_bwd_x, alpha_bwd_w):
+        ctx.save_for_backward(x, weight, bias)
+        ctx.alpha_bwd_x = alpha_bwd_x
+        ctx.alpha_bwd_w = alpha_bwd_w
+        return F.linear(x, weight, bias) * alpha_fwd
+    @staticmethod
+    def backward(ctx, grad_out):
+        x, weight, bias = ctx.saved_tensors
+        # -------- grads ----------
+        grad_x = torch.matmul(grad_out * ctx.alpha_bwd_x, weight)
+        go_flat = (grad_out * ctx.alpha_bwd_w).reshape(-1, grad_out.shape[-1])
+        x_flat  = x.reshape(-1, x.shape[-1])
+        grad_weight = go_flat.t() @ x_flat
+        grad_bias   = go_flat.sum(0) if bias is not None else None
+        return grad_x, grad_weight, grad_bias, None, None, None
+class DragonLinear(nn.Linear):
+    """Linear layer with different forward/backward scalings."""
+    def __init__(self, config: DragonConfig, in_features, out_features, bias=False, alpha_fwd=None, alpha_bwd=None):
+        super().__init__(in_features, out_features, bias)
+        if alpha_fwd is None:
+            alpha_fwd = 1.0 / math.sqrt(in_features)
+        if not config.use_uscaling:
+            alpha_fwd, alpha_bwd = 1, 1
+        self.register_buffer("alpha_fwd", torch.tensor(float(alpha_fwd)), persistent=False)
+        self.register_buffer("alpha_bwd", torch.tensor(float(alpha_bwd if alpha_bwd is not None else alpha_fwd)), persistent=False)
+    def forward(self, x):
+        return _ScaledLinearFB.apply(x, self.weight, self.bias, self.alpha_fwd, self.alpha_bwd, self.alpha_bwd)
+# heavily adapted from flash-linear-attention
+def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor:
+    return cu_seqlens[1:] - cu_seqlens[:-1]
+def prepare_position_ids(cu_seqlens: torch.LongTensor) -> torch.LongTensor:
+    return torch.cat([
+        torch.arange(n, dtype=cu_seqlens.dtype, device=cu_seqlens.device)
+        for n in prepare_lens(cu_seqlens).unbind()
+    ])
+def prepare_sequence_ids(cu_seqlens: torch.LongTensor) -> torch.LongTensor:
+    return prepare_position_ids(cu_seqlens).eq(0).cumsum(0) - 1
+class DragonConv1D(nn.Conv1d):
+    """Wrapper around nn.Conv1d (for definition) and causal_conv1d (for forward)"""
+    def __init__(
+        self,
+        hidden_size: int,
+        kernel_size: int,
+        bias: bool = False,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        super().__init__(
+            in_channels=hidden_size,
+            out_channels=hidden_size,
+            kernel_size=kernel_size,
+            groups=hidden_size,
+            bias=bias,
+            padding=kernel_size - 1,
+            device=device,
+            dtype=dtype,
+        )
+        self.hidden_size = hidden_size
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        cache: Optional[torch.Tensor] = None,
+        output_final_state: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            x (`torch.Tensor`):
+                Tensor of shape `[B, T, D]`.
+                If `seq_idx` is provided, `B` must be 1.
+            mask (`Optional[torch.Tensor]`):
+                Attention mask dealing with padded positions.
+            cache (`Optional[torch.Tensor]`):
+                Previous cache tensor of shape `[N, D, W]`, where `W` is the kernel size.
+                If provided, the cache is updated **inplace**.
+            output_final_state (Optional[bool]):
+                Whether to output the final state of shape `[N, D, W]`. Default: `False`.
+        Returns:
+            Tensor of shape `[B, T, D]`.
+        """
+        B, T, D, W = *x.shape, self.kernel_size[0]
+        N = B
+        if mask is not None:
+            x = x.mul_(mask.unsqueeze(-1))
+        if output_final_state and cache is None:
+            cache = x.new_zeros(N, D, W)
+        # during the decoding phase, we assume the batch is composed of sequences of length 1
+        if cache is not None and T == 1:
+            return self.step(x, cache)
+        if cache is not None:
+            cache[:, :, -min(W, T):].copy_(rearrange(x[..., -min(W, T):, :], 'n w d -> n d w'))
+        x = rearrange(x, 'b t d -> b d t')
+        if causal_conv1d_fn is not None:
+            # Sequence index for each token. Used for varlen.
+            # Suppose a batch consists of two sequences with lengths 3 and 4,
+            # seq_idx=[0, 0, 0, 1, 1, 1, 1] for this batch.
+            # NOTE: No need to provide this arg if `cu_seqlens` is passed.
+            # This arg is just for BC, and will be removed in the future.
+            # [B, T]
+            seq_idx = kwargs.get('seq_idx', None)
+            x = causal_conv1d_fn(
+                x=x.contiguous(),
+                weight=rearrange(self.weight, "d 1 w -> d w"),
+                bias=self.bias,
+                activation="silu",
+                seq_idx=seq_idx,
+            )
+        else:
+            x = self._conv_forward(x, self.weight, self.bias)[..., :x.shape[-1]]
+            x = F.silu(x)
+        return rearrange(x, "b d t -> b t d"), cache
+    def step(
+        self,
+        x: torch.Tensor,
+        cache: torch.Tensor,
+        cu_seqlens: Optional[torch.LongTensor] = None
+    ):
+        shape = x.shape
+        x = x.squeeze(0) if cu_seqlens is not None else x.squeeze(1)
+        if causal_conv1d_update is not None:
+            x = causal_conv1d_update(
+                x=x,
+                conv_state=cache,
+                weight=rearrange(self.weight, "d 1 w -> d w"),
+                bias=self.bias,
+                activation="silu",
+            )
+        else:
+            # we follow the fast mode that updates the cache in-place
+            cache.copy_(cache.roll(shifts=-1, dims=-1))
+            cache[:, :, -1] = x
+            x = torch.sum(cache * rearrange(self.weight, "d 1 w -> d w"), dim=-1)
+            if self.bias is not None:
+                x = x + self.bias
+            x = F.silu(x)
+        return x.view(shape), cache
+class HybridDragonAttentionDynamicCache(DynamicCache):
+    """
+    A dynamic cache that handle both the attention cache (which has a seq_len dimension) and the GDN cache
+    (which has a constant shape regardless of seq_len).
+    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
+    and `ssm_states` for GDN cache. The expected shape for each tensor is as follows:
+    For each layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
+    if local attention produce k and v
+    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
+    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
+    """
+    def __init__(self, config: DragonConfig, dtype=torch.bfloat16):
+        super().__init__()
+        self.config = config
+        self.dtype = dtype
+        self.q_conv_states = []
+        self.k_conv_states = []
+        self.v_conv_states = []
+        self.ssm_states = []
+        self._key_cache = {}
+        self._value_cache = {}
+        for idx, layer_type in enumerate(config.layers_config):
+            if layer_type in ['l', 'd']:
+                self._key_cache[idx] = None
+                self._value_cache[idx] = None
+            self.q_conv_states.append(None)
+            self.k_conv_states.append(None)
+            self.v_conv_states.append(None)
+            self.ssm_states.append(None)
+        self.window_size = config.sliding_window_size
+        self.layers_config = config.layers_config
+        self.past_length = [0 for _ in range(len(config.layers_config))]
+    def update(
+        self,
+        k: torch.Tensor, # (B, L, h, D)
+        v: torch.Tensor, # (B, L, h, D)
+        layer_idx: int,
+    ):
+        added_len = k.size(1)
+        # grab cache
+        k_cache = self._key_cache[layer_idx]
+        v_cache = self._value_cache[layer_idx]
+        if k_cache is None:
+            k_cache = k
+            v_cache = v
+        else:
+            k_cache = torch.cat([k_cache, k], dim=1)
+            v_cache = torch.cat([v_cache, v], dim=1)
+        # save cache
+        self._key_cache[layer_idx] = k_cache
+        self._value_cache[layer_idx] = v_cache
+        # update cache length
+        self.past_length[layer_idx] += added_len
+        return k_cache, v_cache
+    def trim(self, layer_idx: int):
+        # discard old keys/values
+        window_size = min(self.window_size, self.config.slw_wsize) if self.config.slw_wsize > 0 else self.window_size
+        if self.layers_config[layer_idx] == 'l':
+            if self._key_cache[layer_idx].size(1) > window_size:
+                self._key_cache[layer_idx] = self._key_cache[layer_idx][:, -window_size:, ...].contiguous()
+                self._value_cache[layer_idx] = self._value_cache[layer_idx][:, -window_size:, ...].contiguous()
+    def update_ssm_cache(
+        self,
+        q_conv_states: torch.Tensor,
+        k_conv_states: torch.Tensor,
+        v_conv_states: torch.Tensor,
+        ssm_states: torch.Tensor,
+        layer_idx: int,
+    ) -> None:
+        # Update the SSM cache
+        self.q_conv_states[layer_idx] = q_conv_states
+        self.k_conv_states[layer_idx] = k_conv_states
+        self.v_conv_states[layer_idx] = v_conv_states
+        self.ssm_states[layer_idx] = ssm_states
+    def get_ssm_cache(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        # Get the SSM cache for the specified layer
+        return (
+            self.q_conv_states[layer_idx],
+            self.k_conv_states[layer_idx],
+            self.v_conv_states[layer_idx],
+            self.ssm_states[layer_idx],
+        )
+    def get_total_seen(self, layer_idx: int) -> int:
+        return self.past_length[layer_idx]
+    def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
+        raise NotImplementedError("HybridDragonAttentionDynamicCache does not have a legacy cache equivalent.")
+    @classmethod
+    def from_legacy_cache(cls, cache_params: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache":
+        raise NotImplementedError("HybridDragonAttentionDynamicCache does not have a legacy cache equivalent.")
+class DragonRotaryEmbedding(torch.nn.Module):
+    def __init__(self, config: DragonConfig, head_dim: int):
+        super().__init__()
+        self.config = config
+        inv_freq = 1.0 / (config.rope_theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.seq_len_cached = 0
+        self.cos_cached = None
+        self.sin_cached = None
+    def forward(self, x, position_ids):
+        max_pos = self.config.max_position_embeddings
+        if max_pos > self.seq_len_cached:
+            self.seq_len_cached = max(2 * max_pos, 16)
+            t = torch.arange(self.seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
+            freqs = torch.outer(t, self.inv_freq)
+            self.cos_cached = freqs.cos().to(torch.bfloat16)
+            self.sin_cached = freqs.sin().to(torch.bfloat16)
+        cos = self.cos_cached[position_ids] # (B, T, head_dim/2)
+        sin = self.sin_cached[position_ids]
+        cos = cos[..., None, :]       # (B, T, 1, head_dim/2), broadcasts over heads
+        sin = sin[..., None, :]
+        return cos, sin
+def apply_rotary_emb(x, cos, sin):
+    assert x.ndim == 4 # multihead attention
+    d = x.shape[3]//2 # head dim
+    x1 = x[..., :d]
+    x2 = x[..., d:]
+    y1 = x1 * cos + x2 * sin
+    y2 = x1 * (-sin) + x2 * cos
+    return torch.cat([y1, y2], 3).type_as(x)
+# heavily adapted from Gemma3
+def eager_attention_forward(
+    module: nn.Module, # TODO: remove module
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    causal: bool = True,
+    window_size: Optional[Tuple[int, int]] = None,
+    softcap: Optional[float] = None,
+    softmax_scale: Optional[float] = None,
+    **kwargs,
+) -> torch.Tensor:
+    if softmax_scale is None:
+        softmax_scale = module.head_dim**-0.5
+    query = query.transpose(1, 2) # (B, H, L, D)
+    key = key.transpose(1, 2) # (B, H, L, D)
+    value = value.transpose(1, 2) # (B, H, L, D)
+    key = key.repeat_interleave(module.num_heads // module.num_key_value_heads, dim=1)
+    value = value.repeat_interleave(module.num_heads // module.num_key_value_heads, dim=1)
+    attn_weights = torch.matmul(query, key.transpose(2, 3)) * softmax_scale
+    if softcap is not None:
+        attn_weights = torch.tanh(attn_weights / softcap) * softcap
+    if causal or (window_size is not None):
+        Lq = query.size(2)
+        Lk = key.size(2)
+        past = max(Lk - Lq, 0)
+        i = torch.arange(Lq, device=attn_weights.device).unsqueeze(1) + past # [Lq,1]
+        j = torch.arange(Lk, device=attn_weights.device).unsqueeze(0) # [1,Lk]
+        allowed = torch.ones((Lq, Lk), dtype=torch.bool, device=attn_weights.device)
+        if causal:
+            allowed &= (j <= i) # prevent attending to future positions
+        if window_size is not None:
+            w_left, w_right = window_size
+            # treat None as "no limit" on that side
+            if w_left is None:
+                w_left = Lk
+            if w_right is None:
+                w_right = Lk
+            allowed &= (j >= i - w_left) & (j <= i + w_right)
+        # broadcast [Lq,Lk] -> [B, H, Lq, Lk]
+        attn_weights = attn_weights.masked_fill(~allowed, float("-inf"))
+    # upcast attention to fp32
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output
+def get_query_key_value_tensors(module: nn.Module, hidden_states: torch.Tensor):
+    """
+    Derives `query`, `key` and `value` tensors from `hidden_states`.
+    """
+    # (B, L, D) -> (B, L, ng * (np/ng + 2) * hn))
+    mixed_qkv = module.linear_qkv(hidden_states)
+    if getattr(module, "reuse_kv", False):
+        # reshape to [..., num_query_groups, heads_per_group * d]
+        q_dim = (module.num_heads // module.num_key_value_heads) * module.head_dim
+        new_shape = mixed_qkv.size()[:-1] + (module.num_key_value_heads, q_dim)
+        query = mixed_qkv.view(*new_shape)
+        # final shape (B, L, H, d)
+        query = query.reshape(query.size(0), query.size(1), -1, module.head_dim)
+        return query
+    # (B, L, hp) -> (B, L, ng, (np/ng + 2) * hn)
+    new_tensor_shape = mixed_qkv.size()[:-1] + (
+        module.num_key_value_heads,
+        (
+            (module.num_heads // module.num_key_value_heads + 2)
+            * module.head_dim
+        ),
+    )
+    mixed_qkv = mixed_qkv.view(*new_tensor_shape)
+    split_arg_list = [
+        (
+            module.num_heads
+            // module.num_key_value_heads
+            * module.head_dim
+        ),
+        module.head_dim,
+        module.head_dim,
+    ]
+    # [B, L, ng, (np/ng + 2) * hn] -> [B, L, ng, np/ng * hn], [B, L, ng, hn], [B, L, ng, hn]
+    (query, key, value) = torch.split(mixed_qkv, split_arg_list, dim=3)
+    # [B, L, ng, np/ng * hn] -> [B, L, np, hn]
+    query = query.reshape(query.size(0), query.size(1), -1, module.head_dim)
+    return query, key, value
+class DragonAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper.
+    Modified to use sliding window attention: Longformer and "Generating Long Sequences with Sparse Transformers".
+    Doesn't include output projection: output is (B, L, H, D).
+    """
+    def __init__(self, config: DragonConfig, reuse_kv: bool, layer_idx: Optional[int], **kwargs):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+        self.num_heads = config.num_attention_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.hidden_size = config.hidden_size
+        self.projection_dim = config.hidden_size * config.expand_factor
+        self.head_dim = self.projection_dim // self.num_heads
+        self.rope_theta = config.rope_theta
+        self.qk_norm = config.qk_norm
+        self.window_size = config.sliding_window_size
+        self.reuse_kv = reuse_kv
+        projection_dim = self.head_dim * (self.num_heads + 2 * (0 if reuse_kv else self.num_key_value_heads))
+        self.linear_qkv = DragonLinear(config, config.hidden_size, projection_dim, bias=False)
+        if self.qk_norm:
+            self.q_norm = DragonRMSNorm(self.head_dim, eps=config.norm_epsilon)
+            if not reuse_kv:
+                self.k_norm = DragonRMSNorm(self.head_dim, eps=config.norm_epsilon)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        position_ids: Optional[torch.LongTensor] = None,
+        cache_params: Optional[HybridDragonAttentionDynamicCache] = None,
+        key_value_last_layer: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ):
+        # Q, K, V projections.
+        if not self.reuse_kv:
+            query_states, key_states, value_states = get_query_key_value_tensors(self, hidden_states)
+        else:
+            query_states = get_query_key_value_tensors(self, hidden_states)
+            key_states, value_states = key_value_last_layer
+            last_key_states, last_value_states = None, None
+        # QK-norm.
+        if self.qk_norm:
+            query_states = self.q_norm(query_states)
+            if not self.reuse_kv:
+                key_states = self.k_norm(key_states)
+        # RoPE.
+        cos, sin = position_embeddings
+        query_states = apply_rotary_emb(query_states, cos, sin)
+        if not self.reuse_kv:
+            key_states = apply_rotary_emb(key_states, cos, sin)
+        # KV-cache.
+        if not self.reuse_kv and cache_params is not None:
+            key_states, value_states = cache_params.update(key_states, value_states, self.layer_idx)
+        # save k,v for next layer (*after* norm and RoPE and kv-cache update)
+        if not self.reuse_kv:
+            last_key_states, last_value_states = key_states, value_states
+        # attention computation. # TODO: do that in init ?
+        if ATTN_IMPL == "eager":
+            attention_interface = lambda q, k, v, **kw: eager_attention_forward(self, q, k, v, **kw)
+        elif ATTN_IMPL == "fa2":
+            attention_interface = lambda q, k, v, **kw: flash_attn_func(q, k, v, **kw)
+        elif ATTN_IMPL == "fa3":
+            attention_interface = lambda q, k, v, **kw: flash_attn_func(q, k, v, **kw)[0]
+        else:
+            raise ValueError(f"Unknown ATTN_IMPL: {ATTN_IMPL}")
+        attn_output = attention_interface(
+            query_states.bfloat16(),
+            key_states.bfloat16(),
+            value_states.bfloat16(),
+            causal=True,
+            window_size=(min(self.window_size, self.config.slw_wsize) if self.config.slw_wsize > 0 else self.window_size, 0),
+            softcap=self.config.softcap_local_attn,
+            softmax_scale=None if not self.config.use_uscaling else 1/self.head_dim,
+            **kwargs,
+        )
+        if cache_params is not None and not self.reuse_kv:
+            cache_params.trim(self.layer_idx)
+        return attn_output, last_key_states, last_value_states
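+# heavily adapted from the official differential attention implementation
+# NOTE: the eager reference below is disabled: it is kept as a bare string literal and is never executed.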
+"""def eager_differential_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    causal: bool = True,
+    window_size: Optional[Tuple[int, int]] = None,
+    softcap: Optional[float] = None,
+    softmax_scale: Optional[float] = None,
+    **kwargs,
+) -> torch.Tensor:
+    if softmax_scale is None:
+        softmax_scale = module.head_dim ** -0.5
+    B, H2, Lq, Dh = query.shape # H2 = 2 * H
+    H = module.num_heads
+    Hkv = module.num_key_value_heads
+    assert H2 == 2 * H, "query must have 2*num_heads heads"
+    assert key.shape[-1] == Dh, "key head_dim must match query"
+    assert value.shape[-1] == 2 * Dh, "value must have 2*head_dim"
+    # repeat K to 2H (for the two "channels") and V to H (final combined heads)
+    n_rep = H // Hkv
+    k_2H = repeat_kv(key, 2 * n_rep) # [B, 2H, Lk, Dh]
+    v_H  = repeat_kv(value, n_rep) # [B,  H, Lk, 2Dh]
+    # raw attention logits for the 2 channels
+    attn_weights = torch.matmul(query, k_2H.transpose(2, 3)) * softmax_scale  # [B, 2H, Lq, Lk]
+    if softcap is not None:
+        attn_weights = torch.tanh(attn_weights / softcap) * softcap
+    # masking (causal and/or sliding window)
+    if causal or (window_size is not None):
+        Lk = k_2H.size(2)
+        i = torch.arange(Lq, device=attn_weights.device).unsqueeze(1)  # [Lq,1]
+        j = torch.arange(Lk, device=attn_weights.device).unsqueeze(0)  # [1,Lk]
+        allowed = torch.ones((Lq, Lk), dtype=torch.bool, device=attn_weights.device)
+        if causal:
+            allowed &= (j <= i)
+        if window_size is not None:
+            w_left, w_right = window_size
+            if w_left is None:  w_left = Lk
+            if w_right is None: w_right = Lk
+            allowed &= (j >= i - w_left) & (j <= i + w_right)
+        attn_weights = attn_weights.masked_fill(~allowed, float("-inf"))
+    # softmax in fp32 then cast back
+    attn_probs = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) # [B,2H,Lq,Lk]
+    # reshape to [B, H, 2, Lq, Lk] and combine the two channels with learned lambda
+    attn_probs = attn_probs.view(B, H, 2, Lq, -1)  # -1 = Lk
+    # per-head scalar lambdas: exp(<λ_q1,λ_k1>) - exp(<λ_q2,λ_k2>) + λ_init
+    lambda_1 = torch.exp(torch.sum(module.lambda_q1 * module.lambda_k1, dim=-1).float()).to(query.dtype)  # [H]
+    lambda_2 = torch.exp(torch.sum(module.lambda_q2 * module.lambda_k2, dim=-1).float()).to(query.dtype)  # [H]
+    lambda_full = (lambda_1 - lambda_2 + module.lambda_init).view(1, H, 1, 1)  # [1,H,1,1] for broadcast
+    combined_probs = attn_probs[:, :, 0] - lambda_full * attn_probs[:, :, 1]  # [B,H,Lq,Lk]
+    # weighted sum over V (note: V has 2*Dh per head)
+    attn = torch.matmul(combined_probs, v_H)  # [B,H,Lq,2Dh]
+    # sub-layer norm (or similar) then final scaling
+    attn = module.subln(attn)
+    attn = attn * (1 - module.lambda_init)
+    # (B,Lq,H*2Dh)
+    attn = attn.transpose(1, 2).contiguous().view(B, Lq, H * 2 * Dh)
+    return attn"""
+class DragonDifferentialAttention(nn.Module):
+    """
+    Multi-headed differential attention (https://arxiv.org/abs/2410.05258)
+    """
+    def __init__(self, config: DragonConfig, layer_idx: Optional[int], **kwargs):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+        self.num_heads = config.num_attention_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.hidden_size = config.hidden_size
+        self.head_dim = config.hidden_size * config.expand_factor // self.num_heads
+        self.qk_norm = config.qk_norm
+        self.softcap = config.softcap_global_attn
+        self.scalable_softmax = config.scalable_softmax
+        projection_dim = self.head_dim * (self.num_heads + 2 * self.num_key_value_heads)
+        self.linear_qkv = DragonLinear(config, config.hidden_size, projection_dim, bias=False)
+        if self.qk_norm:
+            self.q_norm = DragonRMSNorm(self.head_dim, eps=config.norm_epsilon)
+            self.k_norm = DragonRMSNorm(self.head_dim, eps=config.norm_epsilon)
+        if self.scalable_softmax:
+            self.softmax_scaler = nn.Parameter(torch.ones(self.num_heads, dtype=torch.float32))
+        self.register_buffer("lambda_init", torch.tensor(0.8 - 0.6 * math.exp(-0.3 * (layer_idx+1))), persistent=False)
+        self.lambda_q1 = torch.nn.Parameter(torch.zeros(self.head_dim//2, dtype=torch.float32).normal_(mean=0,std=0.1))
+        self.lambda_k1 = torch.nn.Parameter(torch.zeros(self.head_dim//2, dtype=torch.float32).normal_(mean=0,std=0.1))
+        self.lambda_q2 = torch.nn.Parameter(torch.zeros(self.head_dim//2, dtype=torch.float32).normal_(mean=0,std=0.1))
+        self.lambda_k2 = torch.nn.Parameter(torch.zeros(self.head_dim//2, dtype=torch.float32).normal_(mean=0,std=0.1))
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: Optional[torch.LongTensor] = None,
+        cache_params: Optional[HybridDragonAttentionDynamicCache] = None,
+        **kwargs,
+    ):
+        # Q, K, V projections.
+        query_states, key_states, value_states = get_query_key_value_tensors(self, hidden_states)
+        value_states = value_states.reshape(value_states.size(0), value_states.size(1), value_states.size(2)//2, 2*value_states.size(3))
+        # QK-norm.
+        if self.qk_norm:
+            query_states = self.q_norm(query_states)
+            key_states = self.k_norm(key_states)
+        # scalable softmax.
+        if self.scalable_softmax:
+            # scalable-softmax (https://arxiv.org/abs/2501.19399): multiply q by s*log(n)
+            T = query_states.size(1)
+            pos = (position_ids.to(torch.float32).view(position_ids.size(0), T, 1, 1) + 1.)
+            log_pos = pos.log() if self.config.slw_wsize <= 0 else torch.clamp_max(pos, self.config.slw_wsize).log()
+            query_states = (self.softmax_scaler.view(1, 1, -1, 1) * log_pos) * query_states
+            # TODO: caching mechanism for log_pos
+        # KV-cache.
+        if cache_params is not None:
+            key_states, value_states = cache_params.update(key_states, value_states, self.layer_idx)
+        # attention computation.
+        # split q,k heads into two groups
+        query1_states, query2_states = query_states[:, :, torch.arange(0, self.num_heads, 2)].contiguous(), query_states[:, :, torch.arange(1, self.num_heads, 2)].contiguous()
+        key1_states, key2_states = key_states[:, :, torch.arange(0, self.num_key_value_heads, 2)].contiguous(), key_states[:, :, torch.arange(1, self.num_key_value_heads, 2)].contiguous()
+        # compute
+        # TODO: do that in init ?
+        if DIFF_ATTN_IMPL == "flex_head":
+            def diff_attention_interface(q, k, v, **kw):
+                return flex_head_fa.flash_attn_func(q, k, v, **kw)
+        elif DIFF_ATTN_IMPL == "fa2":
+            def diff_attention_interface(q, k, v, **kw):
+                D = v.size(3)
+                v1 = v[:, :, :, :D//2]#.contiguous()
+                v2 = v[:, :, :, D//2:]#.contiguous()
+                o1 = flash_attn_func(q, k, v1, **kw)
+                o2 = flash_attn_func(q, k, v2, **kw)
+                o = torch.cat([o1, o2], dim=-1)
+                return o
+        elif DIFF_ATTN_IMPL == "fa3":
+            def diff_attention_interface(q, k, v, **kw):
+                D = v.size(3)
+                v1 = v[:, :, :, :D//2]#.contiguous()
+                v2 = v[:, :, :, D//2:]#.contiguous()
+                o1 = flash_attn_func(q, k, v1, **kw)[0]
+                o2 = flash_attn_func(q, k, v2, **kw)[0]
+                o = torch.cat([o1, o2], dim=-1)
+                return o
+        elif DIFF_ATTN_IMPL == "eager":
+            def diff_attention_interface(q, k, v, **kw):
+                D = v.size(3)
+                v1 = v[:, :, :, :D//2]#.contiguous()
+                v2 = v[:, :, :, D//2:]#.contiguous()
+                o1 = eager_attention_forward(self, q, k, v1, **kw)
+                o2 = eager_attention_forward(self, q, k, v2, **kw)
+                o = torch.cat([o1, o2], dim=-1)
+                return o
+        y1 = diff_attention_interface(
+            query1_states.bfloat16(),
+            key1_states.bfloat16(),
+            value_states.bfloat16(),
+            causal=True,
+            window_size=(self.config.slw_wsize, 0),
+            softcap=self.softcap,
+            softmax_scale=None if not self.config.use_uscaling else 1/self.head_dim)
+        y2 = diff_attention_interface(
+            query2_states.bfloat16(),
+            key2_states.bfloat16(),
+            value_states.bfloat16(),
+            causal=True,
+            window_size=(self.config.slw_wsize, 0),
+            softcap=self.softcap,
+            softmax_scale=None if not self.config.use_uscaling else 1/self.head_dim)
+        lambda_1 = torch.exp((self.lambda_q1 * self.lambda_k1).sum(-1).float()) # (H/2)
+        lambda_2 = torch.exp((self.lambda_q2 * self.lambda_k2).sum(-1).float()) # (H/2)
+        lambda_full = (lambda_1 - lambda_2 + self.lambda_init).view(1, 1, -1, 1).type_as(y1)
+        attn_output = (y1 - lambda_full * y2).contiguous()
+        if cache_params is not None:
+            cache_params.trim(self.layer_idx)
+        return attn_output, None, None
+class DragonGatedDeltaNet(nn.Module):
+    def __init__(self, config: DragonConfig, layer_idx: Optional[int], **kwargs):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+        self.conv_size = config.conv_kernel
+        self.conv_bias = config.use_bias
+        self.n_heads = config.num_attention_heads
+        self.n_heads_local = self.n_heads // 1
+        self.d_head = int(config.hidden_size * (config.expand_factor/2)) // self.n_heads
+        self.key_dim = int(self.n_heads * self.d_head)
+        self.value_dim = int(2*self.key_dim) # todo refactor
+        self.head_k_dim = self.d_head
+        self.head_v_dim = int(2*self.d_head)
+        self.silu = nn.SiLU()
+        self.dk = self.head_k_dim
+        self.dv = self.head_v_dim # todo : duplicate variables
+        self.per_head_proj = 2*self.dk + self.dv + 2 # [q k v b a] per head
+        in_proj_dim_global = self.n_heads * self.per_head_proj
+        # todo: rename d_head => head_dim (for consistency with other classes)
+        self.in_proj = DragonLinear(config, config.hidden_size, in_proj_dim_global, bias=False)
+        dt_min = config.time_step_min
+        dt_max = config.time_step_max
+        dt_init_floor = config.time_step_floor
+        A_init_range = config.A_init_range
+        # Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
+        dt = torch.exp(
+            torch.rand(self.n_heads_local) * (math.log(dt_max) - math.log(dt_min))
+            + math.log(dt_min)
+        )
+        dt = torch.clamp(dt, min=dt_init_floor)
+        # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
+        inv_dt = dt + torch.log(-torch.expm1(-dt))
+        with torch.no_grad():
+            self.dt_bias = nn.Parameter(inv_dt)
+        assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0]
+        A = torch.empty(
+            self.n_heads_local, dtype=torch.float32, device=torch.cuda.current_device()
+        ).uniform_(*A_init_range)
+        A_log = torch.log(A)  # Keep A_log in fp32
+        self.A_log = nn.Parameter(A_log)
+        self.q_conv1d = DragonConv1D(
+                hidden_size=self.key_dim,
+                kernel_size=self.conv_size,
+            )
+        self.k_conv1d = DragonConv1D(
+                hidden_size=self.key_dim,
+                kernel_size=self.conv_size,
+            )
+        self.v_conv1d = DragonConv1D(
+                hidden_size=self.value_dim,
+                kernel_size=self.conv_size,
+            )
+        self.g_proj = DragonLinear(config, config.hidden_size, config.hidden_size*config.expand_factor, bias=False)
+        self.act_func_gate = F.silu
+    def forward(self,
+                hidden_states: torch.Tensor,
+                cache_params: Optional[HybridDragonAttentionDynamicCache] = None,
+    ):
+        _, q_len, _ = hidden_states.shape
+        mode = 'fused_recurrent' if q_len <= 64 else 'chunk'
+        if self.training:
+            assert mode == 'chunk', "Only chunk mode is supported in training."
+        # input projection (TP-aware)
+        qkvba = self.in_proj(hidden_states) # (l, b, H_local * per_head_proj)
+        # [L,B,(H*P)] -> [B,L,H,P]
+        qkvba = rearrange(qkvba, "b l (h p) -> b l h p", h=self.n_heads_local).contiguous()
+        # split per head: [B,L,H,dk/dk/dv/1/1]
+        q_proj = qkvba[..., 0:self.dk]
+        k_proj = qkvba[..., self.dk:2*self.dk]
+        v_proj = qkvba[..., 2*self.dk:2*self.dk+self.dv]
+        b_proj = qkvba[..., 2*self.dk+self.dv:2*self.dk+self.dv+1]
+        a_proj = qkvba[..., 2*self.dk+self.dv+1:]
+        # concat for conv
+        q_proj = rearrange(q_proj, "b l h d -> b l (h d)")
+        k_proj = rearrange(k_proj, "b l h d -> b l (h d)")
+        v_proj = rearrange(v_proj, "b l h d -> b l (h d)")
+        b_proj = rearrange(b_proj, "b l h d -> b l (h d)") # d=1
+        a_proj = rearrange(a_proj, "b l h d -> b l (h d)")
+        q_conv_cache, k_conv_cache, v_conv_cache, ssm_cache = (None, None, None, None)
+        if cache_params is not None:
+            q_conv_cache, k_conv_cache, v_conv_cache, ssm_cache = cache_params.get_ssm_cache(self.layer_idx)
+        q, q_conv_cache = self.q_conv1d(
+            x=q_proj,
+            mask=None,
+            cache=q_conv_cache,
+            output_final_state=(cache_params is not None))
+        k, k_conv_cache = self.k_conv1d(
+            x=k_proj,
+            mask=None,
+            cache=k_conv_cache,
+            output_final_state=(cache_params is not None))
+        v, v_conv_cache = self.v_conv1d(
+            x=v_proj,
+            mask=None,
+            cache=v_conv_cache,
+            output_final_state=(cache_params is not None))
+        # back to per-head for kernels
+        q = rearrange(q, "b l (h d) -> b l h d", d=self.dk)
+        k = rearrange(k, "b l (h d) -> b l h d", d=self.dk)
+        v = rearrange(v, "b l (h d) -> b l h d", d=self.dv)
+        beta = b_proj.sigmoid()
+        g = -self.A_log.float().exp() * F.softplus(a_proj.float() + self.dt_bias)
+        if mode == 'chunk':
+            if chunk_gated_delta_rule is not None:
+                o, ssm_cache = chunk_gated_delta_rule(
+                    q=q.bfloat16(),
+                    k=k.bfloat16(),
+                    v=v.bfloat16(),
+                    g=g,
+                    beta=beta,
+                    scale=None if not self.config.use_uscaling else 1/self.head_k_dim,
+                    initial_state=ssm_cache,
+                    output_final_state=(cache_params is not None),
+                    cu_seqlens=None, # for varlen training
+                    head_first=False,
+                    use_qk_l2norm_in_kernel=True
+                ) # (B L H D) where d is head_v_dim
+            else:
+                raise NotImplementedError("PyTorch implementation of chunked GDN is not available.")
+        elif mode == 'fused_recurrent':
+            if fused_recurrent_gated_delta_rule is not None:
+                o, ssm_cache = fused_recurrent_gated_delta_rule(
+                    q=q.bfloat16(),
+                    k=k.bfloat16(),
+                    v=v.bfloat16(),
+                    g=g,
+                    beta=beta,
+                    scale=None if not self.config.use_uscaling else 1/self.head_k_dim,
+                    initial_state=ssm_cache,
+                    output_final_state=(cache_params is not None),
+                    cu_seqlens=None,
+                    use_qk_l2norm_in_kernel=True
+                ) # (B L H D) where d is head_v_dim
+            else:
+                raise NotImplementedError("PyTorch implementation of recurrent GDN is not available.")
+        else:
+            raise NotImplementedError(f"Not supported mode `{mode}`.")
+        g = self.g_proj(hidden_states).view(o.size(0), o.size(1), o.size(2), o.size(3)) # (B, L, H, D)
+        o = o * self.act_func_gate(g)
+        if cache_params is not None:
+            cache_params.update_ssm_cache(
+                q_conv_states=q_conv_cache,
+                k_conv_states=k_conv_cache,
+                v_conv_states=v_conv_cache,
+                ssm_states=ssm_cache,
+                layer_idx=self.layer_idx,
+            )
+        return o
+class DragonMLP(nn.Module):
+    def __init__(self, config: DragonConfig):
+        super().__init__()
+        self.fc_1 = DragonLinear(config, config.hidden_size, config.intermediate_size, bias=False)
+        self.fc_2 = DragonLinear(config, config.intermediate_size, config.hidden_size, bias=False)
+        self.register_buffer("_2_sqrt_5", torch.tensor(2/math.sqrt(5)) if config.use_uscaling else torch.tensor(1.), persistent=False)
+    def forward(self, hidden_states):
+        hidden_states = self.fc_1(hidden_states)
+        hidden_states = self._2_sqrt_5 * F.relu(hidden_states).square()
+        hidden_states = self.fc_2(hidden_states)
+        return hidden_states
+class DragonBlock(GradientCheckpointingLayer):
+    def __init__(self, config: DragonConfig, layer_idx: int, layer_type: str):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.expand_factor = config.expand_factor
+        if layer_type in ['l', 'r']:
+            self.attn = DragonAttention(config, reuse_kv=(layer_type=='r'), layer_idx=layer_idx)
+        elif layer_type == 'd':
+            self.attn = DragonDifferentialAttention(config, layer_idx=layer_idx)
+        else:
+            raise ValueError(f"Unknown layer type: {layer_type}")
+        self.lin_attn = DragonGatedDeltaNet(config, layer_idx=layer_idx)
+        self.mixer_proj = DragonLinear(config, int(self.expand_factor*config.hidden_size), config.hidden_size, bias=False)
+        if isinstance(self.attn, (DragonDifferentialAttention)):
+            self.attn_group_norm = DragonHeadWiseRMSNorm(n_heads=self.attn.num_heads//2, d_head=2*self.attn.head_dim, eps=config.norm_epsilon)
+        else:
+            self.attn_group_norm = DragonHeadWiseRMSNorm(n_heads=self.attn.num_heads, d_head=self.attn.head_dim, eps=config.norm_epsilon)
+        self.lin_attn_group_norm = DragonHeadWiseRMSNorm(n_heads=self.lin_attn.n_heads, d_head=self.lin_attn.head_v_dim, eps=config.norm_epsilon)
+        self.input_norm = DragonRMSNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.postmixer_norm = DragonRMSNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.mlp = DragonMLP(config)
+        self.register_buffer("lns", torch.tensor(1.0 if config.use_uscaling else 1. / math.sqrt(layer_idx + (2 if config.old_lns else 1))), persistent=False)
+        self.register_buffer("sqrt_2_2", torch.tensor(math.sqrt(2)/2) if config.use_uscaling else torch.tensor(1/2), persistent=False)
+        self.register_buffer("sqrt_tau", torch.sqrt(torch.tensor(self.config.uscaling_tau)) if config.use_uscaling else torch.tensor(1.0), persistent=False)
+        self.register_buffer("sqrt_one_minus_tau", torch.sqrt(torch.tensor(1.0 - self.config.uscaling_tau)) if config.use_uscaling else torch.tensor(1.0), persistent=False)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: Optional[torch.LongTensor] = None,
+        cache_params: Optional[HybridDragonAttentionDynamicCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        key_value_last_layer: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ):
+        # MIXER.
+        residual = hidden_states
+        hidden_states = self.lns * self.input_norm(hidden_states) # (B, L, D)
+        y_attn, last_key_states, last_value_states = self.attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            position_ids=position_ids,
+            cache_params=cache_params,
+            key_value_last_layer=key_value_last_layer,
+        ) # (B, L, E*D)
+        y_lin_attn = self.lin_attn(
+            hidden_states=hidden_states,
+            cache_params=cache_params,
+        ) # (B, L, E*D)
+        y_attn = self.attn_group_norm(y_attn).view(y_attn.size(0), y_attn.size(1), -1)
+        y_lin_attn = self.lin_attn_group_norm(y_lin_attn).view(y_lin_attn.size(0), y_lin_attn.size(1), -1)
+        y_mixer = self.mixer_proj(self.sqrt_2_2 * (y_attn + y_lin_attn))
+        hidden_states = self.sqrt_one_minus_tau * residual + self.sqrt_tau * y_mixer
+        # MLP.
+        residual = hidden_states
+        hidden_states = self.lns * self.postmixer_norm(hidden_states)
+        y_mlp = self.mlp(hidden_states) # (B, L, D)
+        hidden_states = self.sqrt_one_minus_tau * residual + self.sqrt_tau * y_mlp
+        return hidden_states, last_key_states, last_value_states
+class DragonPreTrainedModel(PreTrainedModel):
+    config: DragonConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["DragonBlock"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": DragonBlock,
+        "attentions": DragonBlock,
+    }
+    def _init_weights(self, module):
+        if isinstance(module, (DragonLinear, DragonConv1D)):
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+            nn.init.normal_(module.weight, mean=0., std=1. if self.config.use_uscaling else 0.006)
+        elif isinstance(module, nn.Embedding):
+            nn.init.normal_(module.weight, mean=0., std=1. if self.config.use_uscaling else 0.006)
+@dataclass
+class DragonOutput(ModelOutput):
+    """
+    Class for the Dragon model outputs.
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        past_key_values (`HybridDragonAttentionDynamicCache`, *optional*, returned when caching is enabled):
+            The cache after this forward pass. Can be passed back to a later forward call together with the
+            next `input_ids` to avoid providing the old `input_ids`.
+            Among other things it holds the conv states and the recurrent state of the linear-attention layers.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+    """
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[HybridDragonAttentionDynamicCache] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+@dataclass
+class DragonCausalLMOutput(ModelOutput):
+    """
+    Base class for causal language model (or autoregressive) outputs.
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`HybridDragonAttentionDynamicCache`, *optional*, returned when caching is enabled):
+            The cache after this forward pass. Can be passed back to a later forward call together with the
+            next `input_ids` to avoid providing the old `input_ids`.
+            Among other things it holds the conv states and the recurrent state of the linear-attention layers.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+    """
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[HybridDragonAttentionDynamicCache] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+class DragonModel(DragonPreTrainedModel):
+    def __init__(self, config: DragonConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList([DragonBlock(config, layer_idx=i, layer_type=layer) for i, layer in enumerate(config.layers_config)])
+        self.rotary_emb = DragonRotaryEmbedding(config, head_dim=(config.expand_factor*config.hidden_size)//config.num_attention_heads) # only for SWA
+        self.final_norm = DragonRMSNorm(config.hidden_size, eps=config.norm_epsilon)
+        alpha_fwd_out = 1. / float(self.config.hidden_size) if self.config.use_uscaling else 1.0
+        alpha_bwd_out = 1. / math.sqrt(float(self.config.hidden_size)) if self.config.use_uscaling else 1.0
+        self.register_buffer("alpha_fwd_out", torch.tensor(alpha_fwd_out), persistent=False)
+        self.register_buffer("alpha_bwd_out", torch.tensor(alpha_bwd_out), persistent=False)
+        self.gradient_checkpointing = False
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.embedding
+    def set_input_embeddings(self, new_embeddings):
+        self.embedding = new_embeddings
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        past_key_values: Optional[HybridDragonAttentionDynamicCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        **kwargs
+    ) -> DragonOutput:
+        use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds = self.embedding(input_ids)
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+        if use_cache:
+            if past_key_values is None:
+                past_key_values = HybridDragonAttentionDynamicCache(self.config, dtype=self.dtype)
+            elif not isinstance(past_key_values, HybridDragonAttentionDynamicCache):
+                # recreate (todo: upcast instead of recreate)
+                if type(past_key_values) is DynamicCache:
+                    print("upgrading DynamicCache → HybridDragonAttentionDynamicCache")
+                    past_key_values = HybridDragonAttentionDynamicCache(self.config, dtype=self.dtype)
+                else:
+                    raise TypeError(f"Unsupported cache type: {type(past_key_values)}")
+        hidden_states = inputs_embeds
+        if cache_position is None:
+            cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device)
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        all_hidden_states = () if output_hidden_states else None
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        shared_kv = (None, None)
+        for block in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            hidden_states, last_k, last_v = block(
+                hidden_states,
+                position_ids=position_ids,
+                cache_params=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                key_value_last_layer=shared_kv,
+                **kwargs,
+            )
+            shared_kv = (last_k, last_v)
+        hidden_states = self.final_norm(hidden_states)
+        hidden_states = _ScaleFB.apply(hidden_states, self.alpha_fwd_out, self.alpha_bwd_out)
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+        return DragonOutput(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+            hidden_states=all_hidden_states,
+        )
+DragonModel.register_for_auto_class("AutoModel")
+class DragonForCausalLM(DragonPreTrainedModel, GenerationMixin):
+    def __init__(self, config: DragonConfig):
+        super().__init__(config)
+        self.model = DragonModel(config)
+        self.vocab_size = config.vocab_size
+        #self.lm_head = DragonLinear(config, config.hidden_size, config.vocab_size, bias=False, alpha_fwd=1/config.hidden_size, alpha_bwd=1/math.sqrt(config.hidden_size))
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.post_init()
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        use_cache: Optional[bool] = None,
+        past_key_values: Optional[HybridDragonAttentionDynamicCache] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> DragonCausalLMOutput:
+        output_hidden_states = (output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
+        outputs: DragonOutput = self.model(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            use_cache=use_cache,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)[:, slice_indices, :]).float()
+        loss = None
+        if labels is not None:
+            # move labels to correct device
+            labels = labels.to(logits.device)
+            # shift
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # compute loss
+            loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), ignore_index=self.model.padding_idx)
+        return DragonCausalLMOutput(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+        )
+DragonForCausalLM.register_for_auto_class("AutoModelForCausalLM")
+__all__ = ["DragonModel", "DragonForCausalLM", "DragonPreTrainedModel"]

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>"
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:14e265037e4c487453fc3c1238f3cd2a5883f4b7a1c83e1da2dd154492f2527b
+size 14941185

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,219 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "19": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "20": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "21": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "22": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "23": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "24": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "25": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "PreTrainedTokenizerFast"
+}