maxmeuer committed
Commit 6a30f3b · verified · 1 Parent(s): 61d4a54

Update code for latest transformers

Files changed (5):
  1. README.md +6 -8
  2. config.json +4 -4
  3. config.yaml +4 -4
  4. model.py +3 -13
  5. transformer_backbone.py +22 -267
README.md CHANGED
@@ -69,10 +69,10 @@ We do not collect PII (personally identifiable information) for any of these cha
 
 ## Inference
 
-We provide an inference module compatible with HuggingFace Transformers for running model inference. We recommend pinning the transformers library to version 4.46.3. Before executing the inference example below, make sure the [hat-splitter package](https://pypi.org/project/hat-splitter/) is installed in your environment.
+We provide an inference module compatible with HuggingFace Transformers for running model inference. Before executing the inference example below, make sure the [hat-splitter package](https://pypi.org/project/hat-splitter/) is installed in your environment.
 
 ```shell
-pip install 'hat-splitter>=0.1.9' 'transformers==4.46.3' torch
+pip install 'hat-splitter>=0.1.9' transformers torch
 pip install flash_attn
 ```
 
@@ -82,13 +82,12 @@ Download model weights and run inference using the following example:
 import torch
 from transformers import AutoModelForCausalLM
 INPUT ="When was Rome founded?"
-MODEL_ID = "Aleph-Alpha/TFree-HAT-Pretrained-7B-Base"
+MODEL_ID = "Aleph-Alpha/llama-tfree-hat-pretrained-7b-base"
 model = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True,
-    pretrained_model_name_or_path=MODEL_ID,
-    attn_implementation="flash_attention_2",
+    pretrained_model_name_or_path=MODEL_ID
 ).to("cuda", torch.bfloat16)
-input_ids, cumulative_word_lengths = model._prepare_input(INPUT)
+input_ids, cumulative_word_lengths = model._prepare_input(INPUT, add_llama_template=True)
 model_output = model.generate(
     input_ids,
     cumulative_seq_lengths_per_word=cumulative_word_lengths,
@@ -99,8 +98,7 @@ print("Prompt: ", INPUT)
 print("Completion: ", model_output.completion_text)
 ```
 
-Please note that the realized inference speed strongly depends on the maturity of the inference implementation beyond the intrinsic text compression of any model. Besides this huggingface transformers-based inference solution, we are also releasing a [vLLM-based inference solution](https://github.com/Aleph-Alpha/vllm) for our models that is optimized for batched inference. Please note that this vLLM inference for HAT is still under active development.
-
+Please note that the realized inference speed strongly depends on the maturity of the inference implementation beyond the intrinsic text compression of any model. Besides this Hugging Face Transformers-based inference solution, we are also releasing a [vLLM-based inference solution](https://github.com/Aleph-Alpha/vllm) for our models that is optimized for batched inference. Our PR into vLLM is still awaiting review, but we strongly encourage using the optimized inference provided through vLLM.
 
 # Evaluation
 
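For convenience, here is the updated README example assembled end to end. This is a sketch only: it assumes the Hub repo `Aleph-Alpha/llama-tfree-hat-pretrained-7b-base`, a CUDA machine with `flash_attn` installed, and it omits any generation arguments that fall outside the hunks shown above.

```python
# Sketch of the post-commit inference flow from the README.
# Prerequisites (per the updated README):
#   pip install 'hat-splitter>=0.1.9' transformers torch
#   pip install flash_attn
import torch
from transformers import AutoModelForCausalLM

INPUT = "When was Rome founded?"
MODEL_ID = "Aleph-Alpha/llama-tfree-hat-pretrained-7b-base"

model = AutoModelForCausalLM.from_pretrained(
    trust_remote_code=True,                 # loads the custom HAT modeling code from the repo
    pretrained_model_name_or_path=MODEL_ID,
).to("cuda", torch.bfloat16)

# _prepare_input splits the raw text into words; add_llama_template=True wraps
# the prompt in the Llama chat template (new in this commit).
input_ids, cumulative_word_lengths = model._prepare_input(INPUT, add_llama_template=True)

# Additional sampling arguments from the full README are omitted here.
model_output = model.generate(
    input_ids,
    cumulative_seq_lengths_per_word=cumulative_word_lengths,
)

print("Prompt: ", INPUT)
print("Completion: ", model_output.completion_text)
```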
config.json CHANGED
@@ -12,7 +12,7 @@
     "is_neox_style": true,
     "key_query_norm": true,
     "key_query_norm_per_head": true,
-    "max_position_embeddings": 3500,
+    "max_position_embeddings": 32900,
     "mlp_bias": false,
     "num_attention_heads": 32,
     "num_hidden_layers": 32,
@@ -44,7 +44,7 @@
     "is_neox_style": true,
     "key_query_norm": true,
     "key_query_norm_per_head": true,
-    "max_position_embeddings": 28000,
+    "max_position_embeddings": 262144,
     "mlp_bias": false,
     "num_attention_heads": 8,
     "num_hidden_layers": 4,
@@ -75,7 +75,7 @@
     "is_neox_style": true,
     "key_query_norm": true,
     "key_query_norm_per_head": true,
-    "max_position_embeddings": 28000,
+    "max_position_embeddings": 262144,
     "mlp_bias": false,
     "num_attention_heads": 8,
     "num_hidden_layers": 6,
@@ -90,7 +90,7 @@
     "use_cache": true,
     "vocab_size": 256
   },
-  "max_position_embeddings": 28000,
+  "max_position_embeddings": 262144,
   "max_word_size": 100,
   "model_type": "hierarchical_autoregressive_transformer",
   "sliding_window": 768,
config.yaml CHANGED
@@ -6,7 +6,7 @@ encoder_config:
   num_key_value_heads: 8
   rms_norm_eps: 1.0e-05
   intermediate_size: 2816
-  max_position_embeddings: 28000
+  max_position_embeddings: 262144
   rope_scaling:
     rope_type: default
   rope_theta: 100000
@@ -34,7 +34,7 @@ backbone_config:
   num_key_value_heads: 8
   rms_norm_eps: 1.0e-05
   intermediate_size: 14336
-  max_position_embeddings: 3500
+  max_position_embeddings: 32900
   rope_scaling:
     rope_type: default
   rope_theta: 500000
@@ -53,7 +53,7 @@ decoder_config:
   num_key_value_heads: 8
   rms_norm_eps: 1.0e-05
   intermediate_size: 2816
-  max_position_embeddings: 28000
+  max_position_embeddings: 262144
   rope_scaling:
     rope_type: default
   rope_theta: 100000
@@ -82,7 +82,7 @@ auto_map:
 special_token_dict: {}
 max_word_size: 100
 sliding_window: 768
-max_position_embeddings: 28000
+max_position_embeddings: 262144
 torch_dtype: bfloat16
 architectures:
 - HATDecoderForCausalLM
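config.yaml mirrors the same change. A small sketch, assuming PyYAML is installed and both files sit in a local checkout, that checks the YAML and JSON settings stay in sync:

```python
# Sketch: confirm config.yaml and config.json agree on the new context limits.
import json
import yaml

with open("config.yaml") as f:
    y = yaml.safe_load(f)
with open("config.json") as f:
    j = json.load(f)

assert y["max_position_embeddings"] == j["max_position_embeddings"] == 262144
assert y["backbone_config"]["max_position_embeddings"] == 32900
assert y["encoder_config"]["max_position_embeddings"] == 262144
assert y["decoder_config"]["max_position_embeddings"] == 262144
print("context-length settings are consistent")
```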
model.py CHANGED
@@ -26,12 +26,6 @@ from .transformer_backbone import (
     LlamaRotaryEmbedding,
 )
 
-try:
-    transformers_version = version("transformers")
-    if transformers_version != "4.46.3":
-        print(f"Warning: Expecected transformers version 4.46.3, but found {transformers_version}. Outputs might be different.")
-except PackageNotFoundError:
-    print("transformers is not installed")
 
 
 def sample_argmax(logits: torch.Tensor) -> torch.Tensor:
@@ -41,13 +35,12 @@ def sample_argmax(logits: torch.Tensor) -> torch.Tensor:
 LLAMA_TEMPLATE = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant. You give engaging, well-structured answers to user inquiries.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
 
 
-class HATCache(Cache):
+class HATCache:
     encoder_cache: DynamicCache
     backbone_cache: DynamicCache
     decoder_cache: DynamicCache
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+    def __init__(self):
         self.encoder_cache = DynamicCache()
         self.backbone_cache = DynamicCache()
         self.decoder_cache = DynamicCache()
@@ -382,10 +375,7 @@ class HATCrossAttention(nn.Module):
 
         self.o_proj = nn.Linear(in_features=hidden_size, out_features=hidden_size_q, dtype=dtype, bias=False)
 
-        rope_theta = config.rope_theta
-        rope_type = config.rope_scaling["rope_type"]
-
-        self.rotary_emb = LlamaRotaryEmbedding(dim=self.head_dim, base=rope_theta, rope_type=rope_type)
+        self.rotary_emb = LlamaRotaryEmbedding(config=config)
 
     def forward(
         self,
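After this change HATCache no longer subclasses transformers' `Cache`; it is a plain container for the three per-module caches. A minimal sketch of the resulting class as it appears in the `+` lines above, with a usage note (nothing beyond the shown attributes is implied):

```python
# Sketch of HATCache after this commit: a plain wrapper around three
# DynamicCache instances instead of a Cache subclass.
from transformers.cache_utils import DynamicCache


class HATCache:
    encoder_cache: DynamicCache
    backbone_cache: DynamicCache
    decoder_cache: DynamicCache

    def __init__(self):
        self.encoder_cache = DynamicCache()
        self.backbone_cache = DynamicCache()
        self.decoder_cache = DynamicCache()


cache = HATCache()
# Each sub-cache is an ordinary DynamicCache, so the encoder, backbone and
# decoder stacks can each be fed their own past_key_values object.
print(isinstance(cache.backbone_cache, DynamicCache))  # True
print(cache.backbone_cache.get_seq_length())           # 0 before any forward pass
```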
transformer_backbone.py CHANGED
@@ -22,17 +22,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from typing import List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Union
 
 import torch
 import torch.nn.functional as F
-import torch.utils.checkpoint
 from torch import nn
 
+from transformers import PretrainedConfig, dynamic_rope_update
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache, StaticCache
 from transformers.generation import GenerationMixin
-from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from transformers.modeling_flash_attention_utils import _flash_attention_forward
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
@@ -86,117 +85,41 @@ ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)
 
 
 class LlamaRotaryEmbedding(nn.Module):
-    def __init__(
-        self,
-        dim=None,
-        max_position_embeddings=2048,
-        base=10000,
-        device=None,
-        scaling_factor=1.0,
-        rope_type="default",
-        config: Optional[LlamaConfig] = None,
-    ):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: LlamaConfig, device=None):
         super().__init__()
-        # TODO (joao): remove the `if` below, only used for BC
-        self.rope_kwargs = {}
-        if config is None:
-            logger.warning_once(
-                "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the "
-                "`config` argument. All other arguments will be removed in v4.46"
-            )
-            self.rope_kwargs = {
-                "rope_type": rope_type,
-                "factor": scaling_factor,
-                "dim": dim,
-                "base": base,
-                "max_position_embeddings": max_position_embeddings,
-            }
-            self.rope_type = rope_type
-            self.max_seq_len_cached = max_position_embeddings
-            self.original_max_seq_len = max_position_embeddings
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
         else:
-            # BC: "rope_type" was originally "type"
-            if config.rope_scaling is not None:
-                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
-            else:
-                self.rope_type = "default"
-            self.max_seq_len_cached = config.max_position_embeddings
-            self.original_max_seq_len = config.max_position_embeddings
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
 
         self.config = config
         self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
 
-        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         self.original_inv_freq = self.inv_freq
 
-    def _dynamic_frequency_update(self, position_ids, device):
-        """
-        dynamic RoPE layers should recompute `inv_freq` in the following situations:
-        1 - growing beyond the cached sequence length (allow scaling)
-        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
-        """
-        seq_len = torch.max(position_ids) + 1
-        if seq_len > self.max_seq_len_cached:  # growth
-            inv_freq, self.attention_scaling = self.rope_init_fn(
-                self.config, device, seq_len=seq_len, **self.rope_kwargs
-            )
-            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
-            self.max_seq_len_cached = seq_len
-
-        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
-            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
-            self.max_seq_len_cached = self.original_max_seq_len
-
     @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
     def forward(self, x, position_ids):
-        if "dynamic" in self.rope_type:
-            self._dynamic_frequency_update(position_ids, device=x.device)
-
-        # Core RoPE block
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
         position_ids_expanded = position_ids[:, None, :].float()
-        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
-        device_type = x.device.type
-        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
-            cos = emb.cos()
-            sin = emb.sin()
-
-        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
 
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
 
 
-class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
-    """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
-
-    def __init__(self, *args, **kwargs):
-        logger.warning_once(
-            "`LlamaLinearScalingRotaryEmbedding` is deprecated an will be removed in v4.46. Please use "
-            "`LlamaRotaryEmbedding`, which now also does linear scaling (simply pass the model config to __init__)."
-        )
-        kwargs["rope_type"] = "linear"
-        super().__init__(*args, **kwargs)
-
-
-class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
-    """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
-
-    def __init__(self, *args, **kwargs):
-        logger.warning_once(
-            "`LlamaDynamicNTKScalingRotaryEmbedding` is deprecated an will be removed in v4.46. Please use "
-            "`LlamaRotaryEmbedding`, which now also does dynamic ntk scaling (simply pass the model config to "
-            "__init__)."
-        )
-        kwargs["rope_type"] = "dynamic"
-        super().__init__(*args, **kwargs)
-
-
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
     x1 = x[..., : x.shape[-1] // 2]
@@ -565,110 +488,7 @@ class LlamaFlashAttention2(LlamaAttention):
         return attn_output, attn_weights, past_key_value
 
 
-class LlamaSdpaAttention(LlamaAttention):
-    """
-    Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
-    `LlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
-    SDPA API.
-    """
-
-    # Adapted from LlamaAttention.forward
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        if output_attentions:
-            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
-            logger.warning_once(
-                "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
-                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-            )
-            return super().forward(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_value,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-                cache_position=cache_position,
-                position_embeddings=position_embeddings,
-            )
-
-        bsz, q_len, _ = hidden_states.size()
-
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
-
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-        if position_embeddings is None:
-            logger.warning_once(
-                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
-                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
-                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
-                "removed and `position_embeddings` will be mandatory."
-            )
-            cos, sin = self.rotary_emb(value_states, position_ids)
-        else:
-            cos, sin = position_embeddings
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
-
-        if past_key_value is not None:
-            # sin and cos are specific to RoPE models; cache_position needed for the static cache
-            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
-            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-        causal_mask = attention_mask
-        if attention_mask is not None:
-            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
-
-        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
-        # Reference: https://github.com/pytorch/pytorch/issues/112577.
-        if query_states.device.type == "cuda" and causal_mask is not None:
-            query_states = query_states.contiguous()
-            key_states = key_states.contiguous()
-            value_states = value_states.contiguous()
-
-        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
-        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
-        is_causal = True if causal_mask is None and q_len > 1 else False
 
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=causal_mask,
-            dropout_p=self.attention_dropout if self.training else 0.0,
-            is_causal=is_causal,
-        )
-
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.view(bsz, q_len, -1)
-
-        attn_output = self.o_proj(attn_output)
-
-        return attn_output, None, past_key_value
-
-
-LLAMA_ATTENTION_CLASSES = {
-    "eager": LlamaAttention,
-    "flash_attention_2": LlamaFlashAttention2,
-    "sdpa": LlamaSdpaAttention,
-}
 
 
 class LlamaDecoderLayer(nn.Module):
@@ -676,7 +496,7 @@ class LlamaDecoderLayer(nn.Module):
         super().__init__()
         self.hidden_size = config.hidden_size
 
-        self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+        self.self_attn = LlamaFlashAttention2(config=config, layer_idx=layer_idx)
 
         self.mlp = LlamaMLP(config)
         self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -960,9 +780,8 @@ class LlamaModel(LlamaPreTrainedModel):
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
 
-        causal_mask = self._update_causal_mask(
-            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
-        )
+        # Not needed for Flash Attention 2
+        causal_mask = None
         hidden_states = inputs_embeds
 
         # create position embeddings to be shared across the decoder layers
@@ -1028,70 +847,6 @@
             attentions=all_self_attns,
         )
 
-    def _update_causal_mask(
-        self,
-        attention_mask: torch.Tensor,
-        input_tensor: torch.Tensor,
-        cache_position: torch.Tensor,
-        past_key_values: Cache,
-        output_attentions: bool,
-    ):
-        if self.config._attn_implementation == "flash_attention_2":
-            if attention_mask is not None and 0.0 in attention_mask:
-                return attention_mask
-            return None
-
-        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
-        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
-        # to infer the attention mask.
-        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-        using_static_cache = isinstance(past_key_values, StaticCache)
-
-        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
-        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
-            if AttentionMaskConverter._ignore_causal_mask_sdpa(
-                attention_mask,
-                inputs_embeds=input_tensor,
-                past_key_values_length=past_seen_tokens,
-                is_training=self.training,
-            ):
-                return None
-
-        dtype, device = input_tensor.dtype, input_tensor.device
-        sequence_length = input_tensor.shape[1]
-        if using_static_cache:
-            target_length = past_key_values.get_max_cache_shape()
-        else:
-            target_length = (
-                attention_mask.shape[-1]
-                if isinstance(attention_mask, torch.Tensor)
-                else past_seen_tokens + sequence_length + 1
-            )
-
-        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
-        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
-            attention_mask,
-            sequence_length=sequence_length,
-            target_length=target_length,
-            dtype=dtype,
-            device=device,
-            cache_position=cache_position,
-            batch_size=input_tensor.shape[0],
-        )
-
-        if (
-            self.config._attn_implementation == "sdpa"
-            and attention_mask is not None
-            and attention_mask.device.type == "cuda"
-            and not output_attentions
-        ):
-            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
-            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
-            # Details: https://github.com/pytorch/pytorch/issues/110213
-            min_dtype = torch.finfo(dtype).min
-            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
-
-        return causal_mask
 
     @staticmethod
     def _prepare_4d_causal_attention_mask_with_cache_position(
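With this commit the backbone hardcodes LlamaFlashAttention2 (the SDPA and eager paths are removed), and LlamaRotaryEmbedding is driven entirely by the config, with `@dynamic_rope_update` handling advanced RoPE types. A sketch of the slimmed-down rotary embedding in use, assuming the repo's transformer_backbone module is importable and that a transformers LlamaConfig supplies the fields it reads (rope_scaling, rope_theta, max_position_embeddings, hidden_size, num_attention_heads); the concrete values below are illustrative only:

```python
# Sketch: the config-driven rotary embedding after this commit.
import torch
from transformers import LlamaConfig

from transformer_backbone import LlamaRotaryEmbedding  # this repo's module

config = LlamaConfig(
    hidden_size=4096,
    num_attention_heads=32,
    max_position_embeddings=32900,  # backbone value from this commit
    rope_theta=500000,              # backbone rope_theta from config.yaml
)

# No more dim/base/rope_type arguments; everything comes from the config.
rotary = LlamaRotaryEmbedding(config=config)

# forward() only needs a tensor for dtype/device plus the position ids;
# scaling for advanced RoPE types is applied via @dynamic_rope_update.
x = torch.empty(1, 8, config.hidden_size, dtype=torch.bfloat16)
position_ids = torch.arange(8).unsqueeze(0)
cos, sin = rotary(x, position_ids)
print(cos.shape, sin.shape)  # (1, 8, head_dim), head_dim = 4096 // 32 = 128
```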