Update app.py
app.py
CHANGED

@@ -80,6 +80,10 @@ class MultiScaleRetention(nn.Module):
         self.kv_head_dim = self.head_dim # Same as Q head_dim
         self.kv_dim = self.num_key_value_heads * self.kv_head_dim

+        # ✅ Internal state storage for KV cache simulation
+        self.register_buffer('_internal_state', None, persistent=False)
+        self.register_buffer('_state_initialized', torch.tensor(False), persistent=False)
+
         print(f" 📐 Layer {layer_idx} Retention (GQA) initialized:")
         print(f" - hidden_size: {self.hidden_size}")
         print(f" - num_heads (Q): {self.num_heads}")

@@ -117,6 +121,11 @@ class MultiScaleRetention(nn.Module):
             batch, num_key_value_heads, n_rep, slen, head_dim
         )
         return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+    def reset_state(self):
+        """Reset internal state (call at start of new sequence)"""
+        self._internal_state = None
+        self._state_initialized = torch.tensor(False)

     def forward(
         self,

@@ -163,11 +172,17 @@ class MultiScaleRetention(nn.Module):

         # Now all have shape [B, num_heads, L, head_dim]

-        # Retention computation
+        # Retention computation with internal state
+        past_state = self._internal_state if (use_cache and self._state_initialized) else None
         retention_states, new_state = self._compute_retention(
-            query_states, key_states, value_states,
+            query_states, key_states, value_states, past_state
         )

+        # ✅ Store state internally for next iteration
+        if use_cache:
+            self._internal_state = new_state.detach()
+            self._state_initialized = torch.tensor(True)
+
         # Reshape back: [B, num_heads, L, head_dim] -> [B, L, hidden_size]
         retention_states = retention_states.transpose(1, 2).contiguous()
         retention_states = retention_states.reshape(
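
To make the role of past_state / new_state concrete, the sketch below walks one decoding step of the standard retention recurrence, S_t = decay * S_{t-1} + k_t^T v_t with output q_t S_t. It is not taken from app.py: the function name, the fixed decay, and the shapes are illustrative assumptions, and _compute_retention may differ in detail (e.g. per-head decays).

import torch

def retention_step(q_t, k_t, v_t, past_state, decay: float = 0.9):
    """One-token retention step: q_t, k_t, v_t are [B, H, 1, D]; past_state is [B, H, D, D] or None."""
    B, H, _, D = q_t.shape
    if past_state is None:
        past_state = q_t.new_zeros(B, H, D, D)
    new_state = decay * past_state + k_t.transpose(-1, -2) @ v_t   # S_t = decay * S_{t-1} + k^T v
    out = q_t @ new_state                                          # o_t = q_t S_t -> [B, H, 1, D]
    return out, new_state

# The state is threaded across steps the same way _internal_state is reused above
state = None
for _ in range(3):
    q, k, v = (torch.randn(1, 2, 1, 8) for _ in range(3))
    out, state = retention_step(q, k, v, state)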

@@ -187,11 +202,11 @@ class MultiScaleRetention(nn.Module):
         # Output projection
         attn_output = self.o_proj(retention_states)

-        # ✅ Return
-
-
-
-
+        # ✅ Return format for compatibility
+        # Granite expects: (hidden_states, attn_weights)
+        # We return: (output, None) - no past_key_values in return signature
+        # State is stored internally but not returned
+        return (attn_output, None)

     def _compute_retention(
         self,
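
Caller-side view of the 2-tuple convention (hypothetical code, not Granite's actual source): because the recurrent state now lives on the module rather than in the return value, the wrapping decoder layer only ever unpacks two items.

# Illustrative only: how a layer expecting (hidden_states, attn_weights) would consume
# the module; the call signature here is an assumption.
attn_output, attn_weights = layer.self_attn(hidden_states, use_cache=True)
# attn_weights is None; the retention state stays in layer.self_attn._internal_state
# until reset_state() clears it - it is never returned as a past_key_value.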

@@ -354,11 +369,9 @@ class HierarchicalRetention(nn.Module):
         output = torch.stack(hierarchical_outputs, dim=1)
         output = self.norm(output)

-        # ✅ Return
-
-
-        else:
-            return (output, None, None)
+        # ✅ Return format for compatibility with Granite
+        # Granite expects: (hidden_states, attn_weights)
+        return (output, None)


 # =====================================================

@@ -736,6 +749,15 @@ def generate_text_phoenix(

     print(f"✅ Converted {converted}/{total} layers")

+    # ✅ Reset all retention states before generation
+    print(f"🔄 Resetting retention states...")
+    for layer in model.model.layers:
+        if hasattr(layer, 'self_attn') and hasattr(layer.self_attn, 'reset_state'):
+            layer.self_attn.reset_state()
+        elif hasattr(layer, 'self_attn') and hasattr(layer.self_attn, 'base_retention'):
+            if hasattr(layer.self_attn.base_retention, 'reset_state'):
+                layer.self_attn.base_retention.reset_state()
+
     # 3. Load tokenizer
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_url, trust_remote_code=True)
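
Putting the pieces together, the intended calling pattern is roughly the sketch below. Only reset_state() and the internal-state buffers come from this commit; the model loading, the placeholder model_url value, and the generate() arguments are illustrative assumptions about how app.py drives the model.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_url = "..."  # placeholder; app.py supplies its own model_url
model = AutoModelForCausalLM.from_pretrained(model_url, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_url, trust_remote_code=True)

# (attention layers are converted to MultiScaleRetention modules here, as in app.py)

# Start every new prompt from a clean state, mirroring the reset loop added above
for layer in model.model.layers:
    attn = getattr(layer, 'self_attn', None)
    if attn is not None and hasattr(attn, 'reset_state'):
        attn.reset_state()

inputs = tokenizer("Hello", return_tensors="pt")
with torch.no_grad():
    # use_cache=True lets each retention layer keep updating its _internal_state
    output_ids = model.generate(**inputs, max_new_tokens=32, use_cache=True)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))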
|