Spaces:

Heartsync
/

phoenix

Paused

App Files Community

seawolf2357 committed on Nov 6

Commit

2c0487e

verified ·

1 Parent(s): d655ec6

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -103

app.py CHANGED Viewed

@@ -61,27 +61,26 @@ class MultiScaleRetention(nn.Module):
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
-        # ✅ Head dimension 안전하게 계산
-        if hasattr(config, 'head_dim'):
-            self.head_dim = config.head_dim
-        else:
-            self.head_dim = self.hidden_size // self.num_heads
-        # ✅ 나누어떨어지지 않는 경우 처리
         if self.hidden_size % self.num_heads != 0:
-            print(f"  ⚠️ Layer {layer_idx}: hidden_size ({self.hidden_size}) not divisible by num_heads ({self.num_heads})")
-            # 가장 가까운 배수로 조정
-            self.head_dim = (self.hidden_size + self.num_heads - 1) // self.num_heads
-            self.effective_hidden = self.head_dim * self.num_heads
-            print(f"     Adjusted: head_dim={self.head_dim}, effective_hidden={self.effective_hidden}")
-        else:
-            self.effective_hidden = self.hidden_size
-        # Q, K, V projections
-        self.q_proj = nn.Linear(self.hidden_size, self.effective_hidden, bias=False)
-        self.k_proj = nn.Linear(self.hidden_size, self.effective_hidden, bias=False)
-        self.v_proj = nn.Linear(self.hidden_size, self.effective_hidden, bias=False)
-        self.o_proj = nn.Linear(self.effective_hidden, self.hidden_size, bias=False)
         # Retention 특화 파라미터
         decay_values = torch.linspace(0.8, 0.95, self.num_heads)
@@ -90,7 +89,7 @@ class MultiScaleRetention(nn.Module):
         # Group normalization
         self.group_norm = nn.GroupNorm(
             num_groups=self.num_heads,
-            num_channels=self.effective_hidden
         )
     def forward(
@@ -118,21 +117,42 @@ class MultiScaleRetention(nn.Module):
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
-        # ✅ Multi-head reshape (안전하게)
         try:
             query_states = query_states.view(
                 batch_size, seq_len, self.num_heads, self.head_dim
-            ).transpose(1, 2)
             key_states = key_states.view(
                 batch_size, seq_len, self.num_heads, self.head_dim
             ).transpose(1, 2)
             value_states = value_states.view(
                 batch_size, seq_len, self.num_heads, self.head_dim
             ).transpose(1, 2)
         except RuntimeError as e:
-            print(f"  ⚠️ Reshape error: {e}")
-            print(f"     query_states shape: {query_states.shape}")
-            print(f"     Expected: [B={batch_size}, L={seq_len}, H={self.num_heads}, D={self.head_dim}]")
             raise
         # Retention 계산
@@ -144,7 +164,7 @@ class MultiScaleRetention(nn.Module):
         # Reshape back
         retention_states = retention_states.transpose(1, 2).contiguous()
         retention_states = retention_states.reshape(
-            batch_size, seq_len, self.effective_hidden
         )
         # Group norm
@@ -159,14 +179,19 @@ class MultiScaleRetention(nn.Module):
     def _compute_retention(
         self,
-        queries: torch.Tensor,
-        keys: torch.Tensor,
-        values: torch.Tensor,
         past_state: Optional[Tuple] = None
     ):
         """O(n) Retention 계산"""
         batch_size, num_heads, seq_len, head_dim = queries.shape
         # State 초기화
         if past_state is not None:
             state = past_state
@@ -180,22 +205,24 @@ class MultiScaleRetention(nn.Module):
         # 순차 처리 (O(n))
         for t in range(seq_len):
-            q_t = queries[:, :, t, :]
-            k_t = keys[:, :, t, :]
-            v_t = values[:, :, t, :]
             # Decay 적용
             decay = torch.sigmoid(self.decay).view(1, -1, 1, 1)
             state = decay * state
-            # State 업데이트
             state = state + torch.einsum('bhd,bhe->bhde', k_t, v_t)
-            # Output
             output_t = torch.einsum('bhd,bhde->bhe', q_t, state)
             outputs.append(output_t)
-        output = torch.stack(outputs, dim=2)
         return output
@@ -327,41 +354,55 @@ def replace_attention_with_retention(model, use_hierarchical=True):
                 old_attn = layer.self_attn
                 config = model.config
-                # ✅ 실제 hidden_size 확인
-                print(f"  📐 Layer {layer_idx} config:")
-                print(f"     - hidden_size: {config.hidden_size}")
-                print(f"     - num_attention_heads: {config.num_attention_heads}")
                 # ✅ 실제 가중치 shape 확인
                 if hasattr(old_attn, 'q_proj'):
-                    actual_hidden_size = old_attn.q_proj.weight.shape[0]
-                    actual_input_size = old_attn.q_proj.weight.shape[1]
-                    print(f"     - Actual Q proj: {old_attn.q_proj.weight.shape}")
-                    print(f"     - Actual hidden: {actual_hidden_size}")
-                    # ✅ Config 업데이트
-                    if actual_hidden_size != config.hidden_size:
-                        print(f"  ⚠️ Config mismatch! Using actual size: {actual_hidden_size}")
-                        # 임시 config 생성
-                        temp_config = type('Config', (), {})()
-                        temp_config.hidden_size = actual_hidden_size
-                        temp_config.num_attention_heads = config.num_attention_heads
-                        # Head dimension 재계산
-                        temp_config.head_dim = actual_hidden_size // config.num_attention_heads
-                        config = temp_config
-                # PHOENIX Retention으로 교체
                 if use_hierarchical:
                     new_retention = HierarchicalRetention(config, layer_idx)
                 else:
                     new_retention = MultiScaleRetention(config, layer_idx)
-                # ✅ 가중치 복사 (shape 체크 추가)
                 if hasattr(old_attn, 'q_proj'):
-                    # Shape 확인 후 복사
-                    if old_attn.q_proj.weight.shape == new_retention.base_retention.q_proj.weight.shape:
                         new_retention.base_retention.q_proj.weight.data = \
                             old_attn.q_proj.weight.data.clone()
                         new_retention.base_retention.k_proj.weight.data = \
@@ -370,11 +411,10 @@ def replace_attention_with_retention(model, use_hierarchical=True):
                             old_attn.v_proj.weight.data.clone()
                         new_retention.base_retention.o_proj.weight.data = \
                             old_attn.o_proj.weight.data.clone()
-                        print(f"  ✅ Layer {layer_idx}: Weights copied")
                     else:
-                        print(f"  ⚠️ Layer {layer_idx}: Shape mismatch, using random init")
-                        print(f"     Old: {old_attn.q_proj.weight.shape}")
-                        print(f"     New: {new_retention.base_retention.q_proj.weight.shape}")
                 # 교체
                 layer.self_attn = new_retention
@@ -382,50 +422,9 @@ def replace_attention_with_retention(model, use_hierarchical=True):
                 print(f"  ✅ Layer {layer_idx}: Attention → Retention")
-            elif hasattr(layer, 'attn'):
-                # Alternative structure
-                old_attn = layer.attn
-                config = model.config
-                # ✅ 실제 크기 확인
-                if hasattr(old_attn, 'c_attn'):
-                    actual_size = old_attn.c_attn.weight.shape[0] // 3
-                    print(f"  📐 Layer {layer_idx} actual hidden_size: {actual_size}")
-                    if actual_size != config.hidden_size:
-                        temp_config = type('Config', (), {})()
-                        temp_config.hidden_size = actual_size
-                        temp_config.num_attention_heads = config.num_attention_heads
-                        config = temp_config
-                if use_hierarchical:
-                    new_retention = HierarchicalRetention(config, layer_idx)
-                else:
-                    new_retention = MultiScaleRetention(config, layer_idx)
-                # 가중치 복사
-                if hasattr(old_attn, 'c_attn'):
-                    qkv_weight = old_attn.c_attn.weight.data
-                    hidden_size = config.hidden_size
-                    new_retention.base_retention.q_proj.weight.data = \
-                        qkv_weight[:hidden_size, :].clone()
-                    new_retention.base_retention.k_proj.weight.data = \
-                        qkv_weight[hidden_size:2*hidden_size, :].clone()
-                    new_retention.base_retention.v_proj.weight.data = \
-                        qkv_weight[2*hidden_size:, :].clone()
-                    if hasattr(old_attn, 'c_proj'):
-                        new_retention.base_retention.o_proj.weight.data = \
-                            old_attn.c_proj.weight.data.clone()
-                layer.attn = new_retention
-                replaced_count += 1
-                print(f"  ✅ Layer {layer_idx}: Attention → Retention")
         except Exception as e:
-            print(f"  ⚠️ Layer {layer_idx}: Conversion failed - {e}")
             import traceback
             traceback.print_exc()
             continue

         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
+        # ✅ Head dimension 정확하게 계산
+        self.head_dim = self.hidden_size // self.num_heads
+        # ✅ 나누어떨어지는지 확인
         if self.hidden_size % self.num_heads != 0:
+            raise ValueError(
+                f"hidden_size ({self.hidden_size}) must be divisible by "
+                f"num_attention_heads ({self.num_heads})"
+            )
+        print(f"  📐 Layer {layer_idx} Retention config:")
+        print(f"     - hidden_size: {self.hidden_size}")
+        print(f"     - num_heads: {self.num_heads}")
+        print(f"     - head_dim: {self.head_dim}")
+        # Q, K, V projections (hidden_size → hidden_size)
+        self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
         # Retention 특화 파라미터
         decay_values = torch.linspace(0.8, 0.95, self.num_heads)
         # Group normalization
         self.group_norm = nn.GroupNorm(
             num_groups=self.num_heads,
+            num_channels=self.hidden_size
         )
     def forward(
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
+        # ✅ Shape 디버깅
+        print(f"\n  🔍 Retention forward shapes:")
+        print(f"     - Input hidden_states: {hidden_states.shape}")
+        print(f"     - After projection Q: {query_states.shape}")
+        print(f"     - Expected reshape: [{batch_size}, {seq_len}, {self.num_heads}, {self.head_dim}]")
+        # ✅ Multi-head reshape - 정확한 차원으로
         try:
             query_states = query_states.view(
                 batch_size, seq_len, self.num_heads, self.head_dim
+            ).transpose(1, 2)  # [B, H, L, D]
             key_states = key_states.view(
                 batch_size, seq_len, self.num_heads, self.head_dim
             ).transpose(1, 2)
             value_states = value_states.view(
                 batch_size, seq_len, self.num_heads, self.head_dim
             ).transpose(1, 2)
+            print(f"     - After reshape Q: {query_states.shape}")
+            print(f"     ✅ Reshape successful!")
         except RuntimeError as e:
+            print(f"\n  ❌ Reshape failed!")
+            print(f"     - query_states shape: {query_states.shape}")
+            print(f"     - query_states size: {query_states.numel()}")
+            print(f"     - Target shape: [{batch_size}, {seq_len}, {self.num_heads}, {self.head_dim}]")
+            print(f"     - Target size: {batch_size * seq_len * self.num_heads * self.head_dim}")
+            print(f"     - Error: {e}")
+            # ✅ 실제 크기 계산
+            actual_total = query_states.numel()
+            actual_per_token = actual_total // (batch_size * seq_len)
+            print(f"     - Actual hidden per token: {actual_per_token}")
             raise
         # Retention 계산
         # Reshape back
         retention_states = retention_states.transpose(1, 2).contiguous()
         retention_states = retention_states.reshape(
+            batch_size, seq_len, self.hidden_size
         )
         # Group norm
     def _compute_retention(
         self,
+        queries: torch.Tensor,  # [B, H, L, D]
+        keys: torch.Tensor,     # [B, H, L, D]
+        values: torch.Tensor,   # [B, H, L, D]
         past_state: Optional[Tuple] = None
     ):
         """O(n) Retention 계산"""
         batch_size, num_heads, seq_len, head_dim = queries.shape
+        print(f"  🔄 Computing retention:")
+        print(f"     - queries: {queries.shape}")
+        print(f"     - keys: {keys.shape}")
+        print(f"     - values: {values.shape}")
         # State 초기화
         if past_state is not None:
             state = past_state
         # 순차 처리 (O(n))
         for t in range(seq_len):
+            q_t = queries[:, :, t, :]  # [B, H, D]
+            k_t = keys[:, :, t, :]     # [B, H, D]
+            v_t = values[:, :, t, :]   # [B, H, D]
             # Decay 적용
             decay = torch.sigmoid(self.decay).view(1, -1, 1, 1)
             state = decay * state
+            # State 업데이트: S = decay * S + k @ v^T
             state = state + torch.einsum('bhd,bhe->bhde', k_t, v_t)
+            # Output: q @ S
             output_t = torch.einsum('bhd,bhde->bhe', q_t, state)
             outputs.append(output_t)
+        output = torch.stack(outputs, dim=2)  # [B, H, L, D]
+        print(f"     - output: {output.shape}")
         return output
                 old_attn = layer.self_attn
                 config = model.config
+                print(f"\n  📐 Layer {layer_idx} - Original Attention:")
                 # ✅ 실제 가중치 shape 확인
                 if hasattr(old_attn, 'q_proj'):
+                    print(f"     - Q weight: {old_attn.q_proj.weight.shape}")
+                    print(f"     - K weight: {old_attn.k_proj.weight.shape}")
+                    print(f"     - V weight: {old_attn.v_proj.weight.shape}")
+                    print(f"     - O weight: {old_attn.o_proj.weight.shape}")
+                    # ✅ 실제 output 크기 확인
+                    actual_hidden = old_attn.q_proj.weight.shape[0]
+                    actual_input = old_attn.q_proj.weight.shape[1]
+                    print(f"     - Actual output dim: {actual_hidden}")
+                    print(f"     - Actual input dim: {actual_input}")
+                    print(f"     - Config hidden_size: {config.hidden_size}")
+                    # ✅ Config가 맞지 않으면 조정
+                    if actual_hidden != config.hidden_size or actual_input != config.hidden_size:
+                        print(f"     ⚠️ Dimension mismatch detected!")
+                        print(f"     Using actual dimensions: {actual_input} → {actual_hidden}")
+                        # 새로운 config 생성
+                        class CustomConfig:
+                            def __init__(self, hidden, heads):
+                                self.hidden_size = hidden
+                                self.num_attention_heads = heads
+                        config = CustomConfig(actual_hidden, model.config.num_attention_heads)
+                # PHOENIX Retention 생성
+                print(f"\n  🔄 Creating PHOENIX Retention for layer {layer_idx}...")
                 if use_hierarchical:
                     new_retention = HierarchicalRetention(config, layer_idx)
                 else:
                     new_retention = MultiScaleRetention(config, layer_idx)
+                # ✅ 가중치 복사 (shape 완벽히 확인)
                 if hasattr(old_attn, 'q_proj'):
+                    old_q_shape = old_attn.q_proj.weight.shape
+                    new_q_shape = new_retention.base_retention.q_proj.weight.shape
+                    print(f"\n  📋 Weight copy:")
+                    print(f"     - Old Q: {old_q_shape}")
+                    print(f"     - New Q: {new_q_shape}")
+                    if old_q_shape == new_q_shape:
+                        # Shape 일치 - 복사
                         new_retention.base_retention.q_proj.weight.data = \
                             old_attn.q_proj.weight.data.clone()
                         new_retention.base_retention.k_proj.weight.data = \
                             old_attn.v_proj.weight.data.clone()
                         new_retention.base_retention.o_proj.weight.data = \
                             old_attn.o_proj.weight.data.clone()
+                        print(f"     ✅ Weights copied successfully")
                     else:
+                        print(f"     ⚠️ Shape mismatch - using random initialization")
                 # 교체
                 layer.self_attn = new_retention
                 print(f"  ✅ Layer {layer_idx}: Attention → Retention")
         except Exception as e:
+            print(f"\n  ❌ Layer {layer_idx}: Conversion failed")
+            print(f"     Error: {e}")
             import traceback
             traceback.print_exc()
             continue