Spaces:

Heartsync
/

phoenix

Paused

App Files Files Community

seawolf2357 commited on Nov 6

Commit

83d9107

verified ·

1 Parent(s): 068c039

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -48

app.py CHANGED Viewed

@@ -1,10 +1,12 @@
 """
 🔮 PHOENIX Retention Research Platform
-Real Implementation - Attention Replacement
 L40S GPU + Persistent Storage (SQLite + ChromaDB)
 Base Model: IBM Granite 4.0 H 350M (Attention → Retention)
 VIDraft AI Research Lab
 """
 import gradio as gr
@@ -45,13 +47,15 @@ print(f"💾 Storage: {STORAGE_PATH}")
 print(f"🎯 Default Base Model: {DEFAULT_MODEL}")
 # =====================================================
-# PHOENIX Retention Attention (핵심!)
 # =====================================================
 class MultiScaleRetention(nn.Module):
     """
     진짜 Retention Attention
     Transformer의 Self-Attention을 완전히 교체
     """
     def __init__(self, config, layer_idx=0):
@@ -109,6 +113,7 @@ class MultiScaleRetention(nn.Module):
     ):
         """
         O(n) 복잡도 Retention 메커니즘
         """
         batch_size, seq_len, input_dim = hidden_states.shape
@@ -123,46 +128,72 @@ class MultiScaleRetention(nn.Module):
             past_key_value = past_key_values
         # Q, K, V 계산
-        query_states = self.q_proj(hidden_states)  # [B, L, H]
-        key_states = self.k_proj(hidden_states)    # [B, L, H]
-        value_states = self.v_proj(hidden_states)  # [B, L, H]
-        # ✅ Projection 후 크기 확인
-        assert query_states.shape[-1] == self.hidden_size, \
-            f"Q projection output is {query_states.shape[-1]}, expected {self.hidden_size}"
-        # ✅ Multi-head reshape
-        # [B, L, H] -> [B, L, num_heads, head_dim] -> [B, num_heads, L, head_dim]
         query_states = query_states.view(
-            batch_size, seq_len, self.num_heads, self.head_dim
         ).transpose(1, 2)
         key_states = key_states.view(
-            batch_size, seq_len, self.num_heads, self.head_dim
         ).transpose(1, 2)
         value_states = value_states.view(
-            batch_size, seq_len, self.num_heads, self.head_dim
         ).transpose(1, 2)
         # Retention 계산
         retention_states = self._compute_retention(
-            query_states, key_states, value_states, past_key_value
         )
-        # Reshape back: [B, num_heads, L, head_dim] -> [B, L, H]
         retention_states = retention_states.transpose(1, 2).contiguous()
         retention_states = retention_states.reshape(
-            batch_size, seq_len, self.hidden_size
         )
-        # Group norm
-        retention_states = self.group_norm(
-            retention_states.transpose(1, 2)
-        ).transpose(1, 2)
         # Output projection
-        attn_output = self.o_proj(retention_states)
         return (attn_output, None, past_key_value)
@@ -171,10 +202,15 @@ class MultiScaleRetention(nn.Module):
         queries: torch.Tensor,  # [B, H, L, D]
         keys: torch.Tensor,     # [B, H, L, D]
         values: torch.Tensor,   # [B, H, L, D]
-        past_state: Optional[Tuple] = None
     ):
         """O(n) Retention 계산"""
-        batch_size, num_heads, seq_len, head_dim = queries.shape
         # State 초기화
         if past_state is not None:
@@ -301,12 +337,13 @@ class HierarchicalRetention(nn.Module):
 # =====================================================
-# 모델 변환 함수
 # =====================================================
 def replace_attention_with_retention(model, use_hierarchical=True):
     """
     Transformer의 Attention을 PHOENIX Retention으로 교체
     """
     print("🔄 Starting Attention → Retention conversion...")
@@ -353,24 +390,37 @@ def replace_attention_with_retention(model, use_hierarchical=True):
                 else:
                     new_retention = MultiScaleRetention(model.config, layer_idx)
-                # ✅ 가중치 복사
                 if hasattr(old_attn, 'q_proj'):
-                    # Shape 확인
-                    if (old_attn.q_proj.weight.shape ==
-                        new_retention.base_retention.q_proj.weight.shape):
-                        new_retention.base_retention.q_proj.weight.data = \
-                            old_attn.q_proj.weight.data.clone()
-                        new_retention.base_retention.k_proj.weight.data = \
-                            old_attn.k_proj.weight.data.clone()
-                        new_retention.base_retention.v_proj.weight.data = \
-                            old_attn.v_proj.weight.data.clone()
-                        new_retention.base_retention.o_proj.weight.data = \
-                            old_attn.o_proj.weight.data.clone()
-                        print(f"  ✅ Layer {layer_idx}: Weights copied")
-                    else:
-                        print(f"  ⚠️ Layer {layer_idx}: Shape mismatch, random init")
                 # 교체
                 layer.self_attn = new_retention
@@ -1020,7 +1070,7 @@ def get_database_statistics():
 # =====================================================
 with gr.Blocks(
-    title="🔮 PHOENIX Retention Research Platform - Real Implementation",
     theme=gr.themes.Soft(),
 ) as demo:
@@ -1029,9 +1079,12 @@ with gr.Blocks(
     **Post-Hierarchical Optimized Efficient Neural Infinite-conteXt**
-    ## 🔥 진짜 PHOENIX - Attention → Retention 완전 교체
-    이 버전은 Transformer의 Self-Attention을 PHOENIX Retention으로 **실제로 교체**합니다.
     ---
     """)
@@ -1175,14 +1228,14 @@ with gr.Blocks(
     ## 🔥 PHOENIX 핵심 차이점
     ### 이전 버전 (가짜)
-```
     입력 → Granite Attention (O(n²)) → PHOENIX 후처리 → 출력
-```
     ### 현재 버전 (진짜)
-```
     입력 → PHOENIX Retention (O(n)) → 출력
-```
     ## ⏱️ 예상 변환 시간 (350M 모델)
@@ -1196,7 +1249,7 @@ with gr.Blocks(
     - `Qwen/Qwen2.5-0.5B` (500M)
     - `meta-llama/Llama-3.2-1B` (1B)
-    **VIDraft AI Research Lab** | Real PHOENIX Implementation 🔥
     """)
 if __name__ == "__main__":
@@ -1205,4 +1258,4 @@ if __name__ == "__main__":
         server_name="0.0.0.0",
         server_port=7860,
         share=False
-    )

 """
 🔮 PHOENIX Retention Research Platform
+Real Implementation - Attention Replacement (FIXED)
 L40S GPU + Persistent Storage (SQLite + ChromaDB)
 Base Model: IBM Granite 4.0 H 350M (Attention → Retention)
 VIDraft AI Research Lab
+✅ FIX: Shape mismatch 문제 해결
 """
 import gradio as gr
 print(f"🎯 Default Base Model: {DEFAULT_MODEL}")
 # =====================================================
+# PHOENIX Retention Attention (핵심! - FIXED)
 # =====================================================
 class MultiScaleRetention(nn.Module):
     """
     진짜 Retention Attention
     Transformer의 Self-Attention을 완전히 교체
+    ✅ FIX: Adaptive dimension handling
     """
     def __init__(self, config, layer_idx=0):
     ):
         """
         O(n) 복잡도 Retention 메커니즘
+        ✅ FIX: Adaptive dimension handling
         """
         batch_size, seq_len, input_dim = hidden_states.shape
             past_key_value = past_key_values
         # Q, K, V 계산
+        query_states = self.q_proj(hidden_states)  # [B, L, ?]
+        key_states = self.k_proj(hidden_states)    # [B, L, ?]
+        value_states = self.v_proj(hidden_states)  # [B, L, ?]
+        # ✅ 실제 projection output 차원 확인
+        actual_proj_dim = query_states.shape[-1]
+        if actual_proj_dim != self.hidden_size:
+            print(f"  ⚠️ Layer {self.layer_idx} Projection dim mismatch:")
+            print(f"     Expected: {self.hidden_size}, Got: {actual_proj_dim}")
+            # Adaptive head_dim 계산
+            if actual_proj_dim % self.num_heads != 0:
+                raise ValueError(
+                    f"Projection output {actual_proj_dim} not divisible by "
+                    f"num_heads {self.num_heads}"
+                )
+            adaptive_head_dim = actual_proj_dim // self.num_heads
+            print(f"     🔧 Using adaptive head_dim: {adaptive_head_dim}")
+        else:
+            adaptive_head_dim = self.head_dim
+        # ✅ Multi-head reshape (adaptive)
+        # [B, L, actual_proj_dim] -> [B, L, num_heads, head_dim] -> [B, num_heads, L, head_dim]
         query_states = query_states.view(
+            batch_size, seq_len, self.num_heads, adaptive_head_dim
         ).transpose(1, 2)
         key_states = key_states.view(
+            batch_size, seq_len, self.num_heads, adaptive_head_dim
         ).transpose(1, 2)
         value_states = value_states.view(
+            batch_size, seq_len, self.num_heads, adaptive_head_dim
         ).transpose(1, 2)
         # Retention 계산
         retention_states = self._compute_retention(
+            query_states, key_states, value_states, past_key_value,
+            adaptive_head_dim
         )
+        # Reshape back: [B, num_heads, L, head_dim] -> [B, L, actual_proj_dim]
         retention_states = retention_states.transpose(1, 2).contiguous()
         retention_states = retention_states.reshape(
+            batch_size, seq_len, actual_proj_dim
         )
+        # ✅ Group norm (actual_proj_dim 사용)
+        if actual_proj_dim == self.hidden_size:
+            retention_states = self.group_norm(
+                retention_states.transpose(1, 2)
+            ).transpose(1, 2)
+        else:
+            # Adaptive normalization
+            norm = nn.GroupNorm(self.num_heads, actual_proj_dim).to(retention_states.device)
+            retention_states = norm(retention_states.transpose(1, 2)).transpose(1, 2)
         # Output projection
+        # ✅ actual_proj_dim -> hidden_size 변환 필요
+        if actual_proj_dim != self.hidden_size:
+            # Adaptive projection
+            adaptive_o_proj = nn.Linear(actual_proj_dim, self.hidden_size, bias=False).to(retention_states.device)
+            attn_output = adaptive_o_proj(retention_states)
+        else:
+            attn_output = self.o_proj(retention_states)
         return (attn_output, None, past_key_value)
         queries: torch.Tensor,  # [B, H, L, D]
         keys: torch.Tensor,     # [B, H, L, D]
         values: torch.Tensor,   # [B, H, L, D]
+        past_state: Optional[Tuple] = None,
+        head_dim: Optional[int] = None
     ):
         """O(n) Retention 계산"""
+        batch_size, num_heads, seq_len, actual_head_dim = queries.shape
+        # ✅ Use provided head_dim or infer from queries
+        if head_dim is None:
+            head_dim = actual_head_dim
         # State 초기화
         if past_state is not None:
 # =====================================================
+# 모델 변환 함수 (FIXED)
 # =====================================================
 def replace_attention_with_retention(model, use_hierarchical=True):
     """
     Transformer의 Attention을 PHOENIX Retention으로 교체
+    ✅ FIX: Better weight copying and dimension handling
     """
     print("🔄 Starting Attention → Retention conversion...")
                 else:
                     new_retention = MultiScaleRetention(model.config, layer_idx)
+                # ✅ 가중치 복사 (improved)
                 if hasattr(old_attn, 'q_proj'):
+                    try:
+                        # Get target retention module
+                        if use_hierarchical:
+                            target_retention = new_retention.base_retention
+                        else:
+                            target_retention = new_retention
+                        # Shape 확인 및 복사
+                        old_q_shape = old_attn.q_proj.weight.shape
+                        new_q_shape = target_retention.q_proj.weight.shape
+                        if old_q_shape == new_q_shape:
+                            target_retention.q_proj.weight.data = \
+                                old_attn.q_proj.weight.data.clone()
+                            target_retention.k_proj.weight.data = \
+                                old_attn.k_proj.weight.data.clone()
+                            target_retention.v_proj.weight.data = \
+                                old_attn.v_proj.weight.data.clone()
+                            target_retention.o_proj.weight.data = \
+                                old_attn.o_proj.weight.data.clone()
+                            print(f"  ✅ Layer {layer_idx}: Weights copied (shape: {old_q_shape})")
+                        else:
+                            print(f"  ⚠️ Layer {layer_idx}: Shape mismatch")
+                            print(f"     Old: {old_q_shape}, New: {new_q_shape}")
+                            print(f"     Using random initialization")
+                    except Exception as e:
+                        print(f"  ⚠️ Layer {layer_idx}: Weight copy failed - {e}")
                 # 교체
                 layer.self_attn = new_retention
 # =====================================================
 with gr.Blocks(
+    title="🔮 PHOENIX Retention Research Platform - Real Implementation (FIXED)",
     theme=gr.themes.Soft(),
 ) as demo:
     **Post-Hierarchical Optimized Efficient Neural Infinite-conteXt**
+    ## 🔥 진짜 PHOENIX - Attention → Retention 완전 교체 (FIXED)
+    ✅ **FIX**: Shape mismatch 문제 해결
+    - Adaptive dimension handling
+    - Better weight copying
+    - Dynamic projection adjustment
     ---
     """)
     ## 🔥 PHOENIX 핵심 차이점
     ### 이전 버전 (가짜)
+    ```
     입력 → Granite Attention (O(n²)) → PHOENIX 후처리 → 출력
+    ```
     ### 현재 버전 (진짜)
+    ```
     입력 → PHOENIX Retention (O(n)) → 출력
+    ```
     ## ⏱️ 예상 변환 시간 (350M 모델)
     - `Qwen/Qwen2.5-0.5B` (500M)
     - `meta-llama/Llama-3.2-1B` (1B)
+    **VIDraft AI Research Lab** | Real PHOENIX Implementation 🔥 (FIXED)
     """)
 if __name__ == "__main__":
         server_name="0.0.0.0",
         server_port=7860,
         share=False
+    )