Fixes for HF >4.53.3 cache refactoring
modeling_qwen.py  (+47, -10)  CHANGED
@@ -274,7 +274,9 @@ class Qwen2Attention(nn.Module):
                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                     "with a layer index."
                 )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
+            kv_seq_len += past_len
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

@@ -378,7 +380,9 @@ class Qwen2FlashAttention2(Qwen2Attention):
                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                     "with a layer index."
                 )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
+            kv_seq_len += past_len
 
         # Because the input can be padded, the absolute sequence length depends on the max position id.
         rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1

@@ -676,7 +680,9 @@ class Qwen2SdpaAttention(Qwen2Attention):
 
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
+            kv_seq_len += past_len
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
 
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

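The three attention-class fixes above all swap past_key_value.get_usable_length(kv_seq_len, self.layer_idx) for past_key_value.get_seq_length(self.layer_idx). As a rough illustration of what that length means under the refactored Cache API, here is a minimal sketch (not part of the patch; the tensor shapes and layer index are made up, and it assumes a transformers version that exports DynamicCache):

import torch
from transformers import DynamicCache

cache = DynamicCache()
bsz, num_kv_heads, head_dim = 1, 2, 64

# Prefill: store keys/values for 5 tokens in layer 0.
k = torch.zeros(bsz, num_kv_heads, 5, head_dim)
v = torch.zeros(bsz, num_kv_heads, 5, head_dim)
cache.update(k, v, layer_idx=0)

# On the next decoding step, get_seq_length(layer_idx) reports the 5 already-cached
# tokens; the patched attention code adds exactly this number to kv_seq_len.
print(cache.get_seq_length(0))  # 5
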
@@ -993,12 +999,28 @@ class Qwen2Model(Qwen2PreTrainedModel):
             use_cache = False
 
         past_key_values_length = 0
+        use_legacy_cache = False
 
         if use_cache:
-            use_legacy_cache = not isinstance(past_key_values, Cache)
-            if use_legacy_cache:
+            # OLD behavior (removed in HF >= 4.55): treat anything not Cache as "legacy" but then
+            # directly used legacy methods on it (would crash if None or new API).
+            # use_legacy_cache = not isinstance(past_key_values, Cache)
+            # if use_legacy_cache:
+            #     # past_key_values_length = past_key_values.get_seq_length()
+            #     past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+            # NEW behavior: if a legacy tuple is passed, convert it to the new Cache API,
+            # compute length via .get_seq_length(), and remember to return legacy if that's what came in.
+            if past_key_values is not None and not isinstance(past_key_values, Cache):
+                use_legacy_cache = True  # remember input format for return
                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+            if isinstance(past_key_values, Cache):
+                # Layer-agnostic total length; cache_position is handled deeper if needed
+                past_key_values_length = past_key_values.get_seq_length()
+            else:
+                # No cache given on first forward, keep length at 0
+                past_key_values_length = 0
 
         if position_ids is None:
             device = input_ids.device if input_ids is not None else inputs_embeds.device

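The conversion block above can be exercised in isolation. A small sketch under the same assumptions (dummy tensors, Cache and DynamicCache imported from transformers): a legacy tuple of per-layer (key, value) pairs is turned into a DynamicCache, whose get_seq_length() then supplies the layer-agnostic past_key_values_length.

import torch
from transformers import Cache, DynamicCache

# Legacy format: one (key, value) pair per layer, each [batch, kv_heads, seq_len, head_dim].
legacy = tuple(
    (torch.zeros(1, 2, 7, 64), torch.zeros(1, 2, 7, 64))
    for _ in range(2)  # two layers, purely illustrative
)

if legacy is not None and not isinstance(legacy, Cache):
    cache = DynamicCache.from_legacy_cache(legacy)

print(isinstance(cache, Cache))  # True
print(cache.get_seq_length())    # 7 -> used as past_key_values_length
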
@@ -1104,7 +1126,11 @@ class Qwen2Model(Qwen2PreTrainedModel):
 
         next_cache = None
         if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+            # If the caller passed legacy, return legacy. Otherwise return the Cache object.
+            next_cache = (
+                next_decoder_cache.to_legacy_cache() if
+                (use_legacy_cache and next_decoder_cache is not None) else next_decoder_cache
+            )
 
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)

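The use_legacy_cache flag matters because of the round trip sketched below (again illustrative only): to_legacy_cache() converts a DynamicCache back into the tuple-of-tuples format, so a caller that passed a legacy cache in gets the same structure back.

import torch
from transformers import DynamicCache

cache = DynamicCache()
cache.update(torch.zeros(1, 2, 3, 64), torch.zeros(1, 2, 3, 64), layer_idx=0)

legacy_out = cache.to_legacy_cache()
print(type(legacy_out))        # <class 'tuple'>
print(legacy_out[0][0].shape)  # torch.Size([1, 2, 3, 64])
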
@@ -1243,10 +1269,21 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
         # Omit tokens covered by past_key_values
         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
+                # NEW API (HF >= 4.55): use Cache methods
                 cache_length = past_key_values.get_seq_length()
-                past_length = past_key_values.seen_tokens
-                max_cache_length = past_key_values.get_max_length()
+                past_length = cache_length  # `seen_tokens` removed; use total seq length instead
+                try:
+                    max_cache_length = past_key_values.get_max_cache_shape()
+                except Exception:
+                    max_cache_length = None
+
+                # OLD API (deprecated/removed):
+                # cache_length = past_key_values.get_seq_length()
+                # past_length = past_key_values.seen_tokens
+                # max_cache_length = past_key_values.get_max_length()
             else:
+                # Legacy tuple format: keep computing lengths directly from tensors
+                # (We keep it compatible without forcing a conversion here)
                 cache_length = past_length = past_key_values[0][0].shape[2]
                 max_cache_length = None

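The length bookkeeping above can also be checked on its own. A hedged sketch, assuming get_max_cache_shape() exists on the installed transformers (which is exactly why the patch wraps it in try/except): for an unbounded DynamicCache the maximum shape is None, so only cache_length/past_length end up constraining how many prompt tokens are trimmed.

import torch
from transformers import DynamicCache

cache = DynamicCache()
cache.update(torch.zeros(1, 2, 9, 64), torch.zeros(1, 2, 9, 64), layer_idx=0)

cache_length = cache.get_seq_length()
past_length = cache_length  # `seen_tokens` is gone; the total cached length stands in for it
try:
    max_cache_length = cache.get_max_cache_shape()  # None for an unbounded DynamicCache
except AttributeError:
    max_cache_length = None

print(cache_length, past_length, max_cache_length)  # 9 9 None
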
@@ -1287,7 +1324,7 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
         model_inputs.update(
             {
                 "position_ids": position_ids,
-                "past_key_values": past_key_values,
+                "past_key_values": past_key_values,  # pass through unchanged (legacy or new Cache object)
                 "use_cache": kwargs.get("use_cache"),
                 "attention_mask": attention_mask,
             }
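Taken together, the intent of the patch is that generation on newer transformers keeps working whether a legacy tuple or a Cache object flows through the model. A rough end-to-end usage sketch (the repository id below is a placeholder, not this repo's actual name):

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "org/model-with-patched-modeling-qwen"  # placeholder id
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

inputs = tokenizer("Hello", return_tensors="pt")
# use_cache=True exercises the DynamicCache path fixed above.
out = model.generate(**inputs, max_new_tokens=8, use_cache=True)
print(tokenizer.decode(out[0], skip_special_tokens=True))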