Update app.py
app.py (CHANGED)
@@ -635,12 +635,20 @@ def replace_attention_with_retention(model, use_hierarchical=True, structure_inf
                 v_match = old_attn.v_proj.weight.shape == target.v_proj.weight.shape
                 o_match = old_attn.o_proj.weight.shape == target.o_proj.weight.shape
 
+                if layer_idx == 0:  # detailed output for the first layer only
+                    print(f" 🔍 Layer 0 shape analysis:")
+                    print(f" Old Q: {old_attn.q_proj.weight.shape} vs New Q: {target.q_proj.weight.shape} → {'✅' if q_match else '❌'}")
+                    print(f" Old K: {old_attn.k_proj.weight.shape} vs New K: {target.k_proj.weight.shape} → {'✅' if k_match else '❌'}")
+                    print(f" Old V: {old_attn.v_proj.weight.shape} vs New V: {target.v_proj.weight.shape} → {'✅' if v_match else '❌'}")
+                    print(f" Old O: {old_attn.o_proj.weight.shape} vs New O: {target.o_proj.weight.shape} → {'✅' if o_match else '❌'}")
+
                 if q_match and k_match and v_match and o_match:
                     target.q_proj.weight.data = old_attn.q_proj.weight.data.clone()
                     target.k_proj.weight.data = old_attn.k_proj.weight.data.clone()
                     target.v_proj.weight.data = old_attn.v_proj.weight.data.clone()
                     target.o_proj.weight.data = old_attn.o_proj.weight.data.clone()
-
+                    if layer_idx == 0:
+                        print(f" ✅ Layer {layer_idx}: Perfect match - weights copied")
 
                 elif q_match and o_match:
                     target.q_proj.weight.data = old_attn.q_proj.weight.data.clone()
@@ -652,14 +660,17 @@ def replace_attention_with_retention(model, use_hierarchical=True, structure_inf
                     target.k_proj.weight.data[:k_copy_size] = old_attn.k_proj.weight.data[:k_copy_size].clone()
                     target.v_proj.weight.data[:v_copy_size] = old_attn.v_proj.weight.data[:v_copy_size].clone()
 
-
+                    if layer_idx == 0:
+                        print(f" ✅ Layer {layer_idx}: Partial match (GQA) - partial weights copied")
 
                 else:
                     nn.init.xavier_uniform_(target.q_proj.weight)
                     nn.init.xavier_uniform_(target.k_proj.weight)
                     nn.init.xavier_uniform_(target.v_proj.weight)
                     nn.init.xavier_uniform_(target.o_proj.weight)
-
+                    if layer_idx == 0:
+                        print(f" ⚠️ Layer {layer_idx}: Shape mismatch - Xavier init used")
+                        print(f" This will result in random weights!")
 
             except Exception as e:
                 print(f" ⚠️ Layer {layer_idx}: Weight copy failed - {e}")
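For context on the `elif q_match and o_match` branch above: in a grouped-query attention (GQA) checkpoint, k_proj and v_proj have fewer output rows than q_proj, so only Q and O line up with a retention module that keeps one KV head per query head. A minimal sketch of the shape arithmetic, with made-up dimensions (the real values come from the model config, not from this diff):

def proj_shapes(hidden_size, num_heads, num_kv_heads):
    # (out_features, in_features) of each nn.Linear, Llama-style attention layout
    head_dim = hidden_size // num_heads
    return {
        "q_proj": (num_heads * head_dim, hidden_size),
        "k_proj": (num_kv_heads * head_dim, hidden_size),  # shrinks when num_kv_heads < num_heads
        "v_proj": (num_kv_heads * head_dim, hidden_size),
        "o_proj": (hidden_size, num_heads * head_dim),
    }

old = proj_shapes(hidden_size=2048, num_heads=32, num_kv_heads=8)    # GQA source attention (example numbers)
new = proj_shapes(hidden_size=2048, num_heads=32, num_kv_heads=32)   # retention sized for full heads (assumption)
print(old["k_proj"], "->", new["k_proj"])  # (512, 2048) -> (2048, 2048): only the first 512 rows can be copied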
@@ -989,6 +1000,8 @@ def replace_attention_with_retention(model, use_hierarchical=True):
     total_layers = len(layers)
     config = model.config
 
+    print(f"Converting {total_layers} layers...")
+
     for layer_idx, layer in enumerate(layers):
         if hasattr(layer, 'self_attn'):
             old_attn = layer.self_attn
@@ -1002,16 +1015,43 @@ def replace_attention_with_retention(model, use_hierarchical=True):
             try:
                 target = new_retention.base_retention if use_hierarchical else new_retention
 
-
+                # Shape check
+                q_match = old_attn.q_proj.weight.shape == target.q_proj.weight.shape
+                k_match = old_attn.k_proj.weight.shape == target.k_proj.weight.shape
+                v_match = old_attn.v_proj.weight.shape == target.v_proj.weight.shape
+                o_match = old_attn.o_proj.weight.shape == target.o_proj.weight.shape
+
+                if layer_idx == 0:
+                    print(f"Layer 0 analysis:")
+                    print(f" Q: {old_attn.q_proj.weight.shape} vs {target.q_proj.weight.shape} → {'✅' if q_match else '❌'}")
+                    print(f" K: {old_attn.k_proj.weight.shape} vs {target.k_proj.weight.shape} → {'✅' if k_match else '❌'}")
+                    print(f" V: {old_attn.v_proj.weight.shape} vs {target.v_proj.weight.shape} → {'✅' if v_match else '❌'}")
+                    print(f" O: {old_attn.o_proj.weight.shape} vs {target.o_proj.weight.shape} → {'✅' if o_match else '❌'}")
+
+                # Copy weights
+                if q_match and k_match and v_match and o_match:
                     target.q_proj.weight.data = old_attn.q_proj.weight.data.clone()
-                if old_attn.k_proj.weight.shape == target.k_proj.weight.shape:
                     target.k_proj.weight.data = old_attn.k_proj.weight.data.clone()
-                if old_attn.v_proj.weight.shape == target.v_proj.weight.shape:
                     target.v_proj.weight.data = old_attn.v_proj.weight.data.clone()
-                if old_attn.o_proj.weight.shape == target.o_proj.weight.shape:
                     target.o_proj.weight.data = old_attn.o_proj.weight.data.clone()
+                    if layer_idx == 0:
+                        print(f" ✅ Perfect match - weights copied")
+                elif q_match and o_match:
+                    target.q_proj.weight.data = old_attn.q_proj.weight.data.clone()
+                    target.o_proj.weight.data = old_attn.o_proj.weight.data.clone()
+                    k_copy_size = min(old_attn.k_proj.weight.shape[0], target.k_proj.weight.shape[0])
+                    v_copy_size = min(old_attn.v_proj.weight.shape[0], target.v_proj.weight.shape[0])
+                    target.k_proj.weight.data[:k_copy_size] = old_attn.k_proj.weight.data[:k_copy_size].clone()
+                    target.v_proj.weight.data[:v_copy_size] = old_attn.v_proj.weight.data[:v_copy_size].clone()
+                    if layer_idx == 0:
+                        print(f" ✅ Partial match (GQA) - partial copy")
+                else:
+                    if layer_idx == 0:
+                        print(f" ⚠️ Shape mismatch - keeping random init")
+
             except Exception as e:
-
+                if layer_idx == 0:
+                    print(f"Weight copy error: {e}")
 
             layer.self_attn = new_retention
             converted_count += 1
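The `k_copy_size` / `v_copy_size` lines above copy only the leading rows that both projections share. A toy version of that partial copy, with example tensor sizes not taken from app.py:

import torch

old_k = torch.randn(512, 2048)    # GQA-sized K projection (example size)
new_k = torch.zeros(2048, 2048)   # larger retention K projection (example size)

k_copy_size = min(old_k.shape[0], new_k.shape[0])      # 512
new_k[:k_copy_size] = old_k[:k_copy_size].clone()      # remaining rows keep their existing init
print(k_copy_size, new_k[:k_copy_size].equal(old_k))   # 512 True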
@@ -1150,10 +1190,17 @@ class PhoenixModelForCausalLM(PhoenixPreTrainedModel):
         print(f" Missing keys: {len(missing)}")
         print(f" Unexpected keys: {len(unexpected)}")
 
+        # Print details (first 5 only)
+        if missing:
+            print(f" Missing (first 5): {missing[:5]}")
+        if unexpected:
+            print(f" Unexpected (first 5): {unexpected[:5]}")
+
         # Check the Retention weights
         retention_keys = [k for k in state_dict.keys() if 'retention' in k.lower()]
         if retention_keys:
             print(f" ✅ Found {len(retention_keys)} Retention weight keys")
+            print(f" Sample keys: {retention_keys[:3]}")
         else:
             print(f" ⚠️ No Retention keys found in state dict")
 
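For readers unfamiliar with the `missing` / `unexpected` lists printed above: `load_state_dict(strict=False)` returns exactly these two key lists. A generic PyTorch illustration (not the PHOENIX checkpoint):

import torch
import torch.nn as nn

module = nn.Linear(4, 4)
ckpt = {"weight": torch.zeros(4, 4), "retention.decay": torch.zeros(1)}  # no "bias", one extra key
result = module.load_state_dict(ckpt, strict=False)
print(result.missing_keys)     # ['bias']
print(result.unexpected_keys)  # ['retention.decay']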
@@ -2314,18 +2361,33 @@ def validate_phoenix_model(
     retention_count = 0
     attention_count = 0
 
-
-
-
-
-
-
-
-
-
-
-
-
+    # For PhoenixModelForCausalLM wrappers, check _original_model
+    check_model = model
+    if hasattr(model, '_original_model') and model._original_model is not None:
+        print(f" 📋 Detected PhoenixModelForCausalLM wrapper")
+        check_model = model._original_model
+
+    layers = []
+    if hasattr(check_model, 'model') and hasattr(check_model.model, 'layers'):
+        layers = check_model.model.layers
+    elif hasattr(check_model, 'layers'):
+        layers = check_model.layers
+
+    print(f" 🔍 Checking {len(layers)} layers...")
+
+    for i, layer in enumerate(layers):
+        if hasattr(layer, 'self_attn'):
+            attn = layer.self_attn
+            class_name = attn.__class__.__name__
+
+            if 'Retention' in class_name:
+                retention_count += 1
+                if i < 3:  # print only the first 3
+                    print(f" ✅ Layer {i}: {class_name}")
+            else:
+                attention_count += 1
+                if i < 3:
+                    print(f" ⚠️ Layer {i}: {class_name}")
 
     total = retention_count + attention_count
     retention_info = f"""
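The validation above decides Retention vs. Attention purely from the class name of each layer's `self_attn` module. A toy version of that check; the class names here are invented for illustration:

import torch.nn as nn

class MultiScaleRetention(nn.Module):
    pass

class VanillaAttention(nn.Module):
    pass

class Block(nn.Module):
    def __init__(self, attn):
        super().__init__()
        self.self_attn = attn

layers = nn.ModuleList([Block(MultiScaleRetention()), Block(VanillaAttention())])
retention_count = sum('Retention' in blk.self_attn.__class__.__name__ for blk in layers)
print(f"{retention_count}/{len(layers)} layers use Retention")  # 1/2 layers use Retention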
@@ -2334,7 +2396,7 @@ def validate_phoenix_model(
 - **Attention Layers**: {attention_count}/{total}
 - **Status**: {'✅ PHOENIX Active' if retention_count > 0 else '⚠️ No Retention Found'}
     """
-    print(f"
+    print(f" 📊 Result: {retention_count}/{total} layers have Retention")
 
     # 4. Generation test
    print(f"\n🚀 Running generation tests...")