Update app.py
app.py CHANGED
@@ -534,38 +534,57 @@ def replace_attention_with_retention(model, use_hierarchical=True, structure_inf
     replaced_count = 0
     total_layers = 0
 
+    # Find the decoder layers (try several known paths)
+    layers = None
+    layer_path = None
+
+    # 1. Use structure_info if available
     if structure_info and structure_info.get('layer_path'):
         layer_path = structure_info['layer_path']
         print(f" Using structure info: {layer_path}")
 
         if layer_path == 'model.layers':
+            if hasattr(model, 'model') and hasattr(model.model, 'layers'):
+                layers = model.model.layers
         elif layer_path == 'transformer.h':
+            if hasattr(model, 'transformer') and hasattr(model.transformer, 'h'):
+                layers = model.transformer.h
         elif layer_path == 'layers':
+            if hasattr(model, 'layers'):
+                layers = model.layers
         elif layer_path == 'model.decoder.layers':
+            if hasattr(model, 'model') and hasattr(model.model, 'decoder') and hasattr(model.model.decoder, 'layers'):
+                layers = model.model.decoder.layers
+
+    # 2. Auto-detect (when structure_info is missing or did not resolve)
+    if layers is None:
+        print(f" Auto-detecting layer structure...")
+
+        possible_paths = [
+            ('model.layers', lambda m: m.model.layers if hasattr(m, 'model') and hasattr(m.model, 'layers') else None),
+            ('transformer.h', lambda m: m.transformer.h if hasattr(m, 'transformer') and hasattr(m.transformer, 'h') else None),
+            ('layers', lambda m: m.layers if hasattr(m, 'layers') else None),
+            ('model.decoder.layers', lambda m: m.model.decoder.layers if hasattr(m, 'model') and hasattr(m.model, 'decoder') and hasattr(m.model.decoder, 'layers') else None),
+        ]
+
+        for path_name, path_fn in possible_paths:
+            result = path_fn(model)
+            if result is not None:
+                layers = result
+                layer_path = path_name
+                print(f" ✅ Found layers at: {path_name}")
+                break
 
     if layers is None:
+        print("❌ Cannot find layers - model structure not supported")
+        print(f" Model type: {type(model)}")
+        print(f" Has 'model' attr: {hasattr(model, 'model')}")
+        print(f" Has 'transformer' attr: {hasattr(model, 'transformer')}")
+        print(f" Has 'layers' attr: {hasattr(model, 'layers')}")
         return model, 0, 0
 
     total_layers = len(layers)
-    print(f" Found {total_layers} layers")
+    print(f" Found {total_layers} layers at '{layer_path}'")
 
     # Detect GQA (structure_info takes priority)
     if structure_info and structure_info.get('gqa_detected'):
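The four hardcoded path checks above amount to resolving a dotted attribute path with a fallback list. The sketch below is not part of app.py; find_decoder_layers, _resolve, and CANDIDATE_PATHS are illustrative names used only to make the lookup logic explicit.

from typing import Any, Optional, Sequence, Tuple

# Known decoder-layer locations, mirroring the paths the diff checks explicitly.
CANDIDATE_PATHS: Sequence[str] = (
    "model.layers",          # LLaMA / Mistral style
    "transformer.h",         # GPT-2 style
    "layers",                # bare decoder stacks
    "model.decoder.layers",  # OPT style
)

def _resolve(obj: Any, dotted: str) -> Optional[Any]:
    """Walk a dotted attribute path, returning None as soon as a hop is missing."""
    for name in dotted.split("."):
        obj = getattr(obj, name, None)
        if obj is None:
            return None
    return obj

def find_decoder_layers(model: Any, hint: Optional[str] = None) -> Tuple[Optional[Any], Optional[str]]:
    """Try the hinted path first (e.g. structure_info['layer_path']), then the candidates."""
    paths = ([hint] if hint else []) + [p for p in CANDIDATE_PATHS if p != hint]
    for path in paths:
        layers = _resolve(model, path)
        if layers is not None:
            return layers, path
    return None, None

For these four layouts the behaviour matches the diff; the explicit hasattr chains in app.py simply spell the same walk out per path.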
@@ -944,43 +963,54 @@ class HierarchicalRetention(nn.Module):
 
 
 def replace_attention_with_retention(model, use_hierarchical=True):
-    """Convert Attention → Retention"""
+    """Convert Attention → Retention (improved)"""
     converted_count = 0
     total_layers = 0
 
+    # Find the layers (try several paths)
+    layers = None
+
     if hasattr(model, 'model') and hasattr(model.model, 'layers'):
         layers = model.model.layers
+    elif hasattr(model, 'transformer') and hasattr(model.transformer, 'h'):
+        layers = model.transformer.h
+    elif hasattr(model, 'layers'):
+        layers = model.layers
+    else:
+        print("Cannot find layers in model")
+        return model, 0, 0
 
+    total_layers = len(layers)
+    config = model.config
+
+    for layer_idx, layer in enumerate(layers):
+        if hasattr(layer, 'self_attn'):
+            old_attn = layer.self_attn
+
+            if use_hierarchical:
+                new_retention = HierarchicalRetention(config, layer_idx)
+            else:
+                new_retention = MultiScaleRetention(config, layer_idx)
+
+            if hasattr(old_attn, 'q_proj'):
+                try:
+                    target = new_retention.base_retention if use_hierarchical else new_retention
+
+                    if old_attn.q_proj.weight.shape == target.q_proj.weight.shape:
+                        target.q_proj.weight.data = old_attn.q_proj.weight.data.clone()
+                    if old_attn.k_proj.weight.shape == target.k_proj.weight.shape:
+                        target.k_proj.weight.data = old_attn.k_proj.weight.data.clone()
+                    if old_attn.v_proj.weight.shape == target.v_proj.weight.shape:
+                        target.v_proj.weight.data = old_attn.v_proj.weight.data.clone()
+                    if old_attn.o_proj.weight.shape == target.o_proj.weight.shape:
+                        target.o_proj.weight.data = old_attn.o_proj.weight.data.clone()
+                except Exception as e:
+                    print(f"Weight copy warning for layer {layer_idx}: {e}")
+
+            layer.self_attn = new_retention
+            converted_count += 1
+
+    print(f"Converted {converted_count}/{total_layers} layers to Retention")
     return model, converted_count, total_layers
 
 
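The per-projection shape checks in the loop above repeat one pattern four times. A compact equivalent, shown only as a sketch under the assumption that both the old attention module and the retention target expose q_proj/k_proj/v_proj/o_proj as nn.Linear layers, loops over the names; copy_matching_projections is a hypothetical helper, not a function in app.py.

import torch
import torch.nn as nn

@torch.no_grad()
def copy_matching_projections(old_attn: nn.Module, target: nn.Module) -> int:
    """Copy q/k/v/o projection weights whose shapes match; return how many were copied."""
    copied = 0
    for name in ("q_proj", "k_proj", "v_proj", "o_proj"):
        old_proj = getattr(old_attn, name, None)
        new_proj = getattr(target, name, None)
        if old_proj is None or new_proj is None:
            continue  # projection missing on one side, e.g. fused qkv layouts
        if old_proj.weight.shape == new_proj.weight.shape:
            new_proj.weight.copy_(old_proj.weight)
            copied += 1
    return copied

A return value of 0 would flag layers where nothing transferred, which the diff currently only surfaces through the try/except warning.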
@@ -1042,6 +1072,10 @@ class PhoenixModelForCausalLM(PhoenixPreTrainedModel):
 
         print(f"✅ Converted {converted}/{total} layers to Retention")
 
+        if converted == 0:
+            print(f"⚠️ WARNING: No layers were converted!")
+            print(f" Model may not have Retention active.")
+
         phoenix_instance = cls(config)
         phoenix_instance._original_model = base_model
         phoenix_instance._initialized = True
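The added check only warns when converted == 0 and then continues to build the Phoenix instance. A stricter variant, offered purely as a sketch of an alternative and not as app.py's behaviour, would abort instead; ensure_converted is an illustrative name.

def ensure_converted(converted: int, total: int, model) -> None:
    """Hypothetical guard: raise instead of only warning when no layers were converted."""
    if converted == 0:
        raise RuntimeError(
            f"Attention → Retention conversion failed: 0/{total} layers converted "
            f"(model type: {type(model).__name__})"
        )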
@@ -1605,8 +1639,9 @@ def burn_model_zero_shot(
     print(f"\n🔄 STEP 3: Converting Attention → Retention...")
     convert_start = time.time()
 
+    # ✅ FIX: pass the whole model and let the converter locate the layers itself
+    model, converted, total = replace_attention_with_retention(
+        model,
         use_hierarchical=use_hierarchical,
         structure_info=structure_info
     )
@@ -1618,8 +1653,30 @@ def burn_model_zero_shot(
 
     if converted == 0:
         print(f"\n⚠️ WARNING: No layers were converted!")
+        print(f" This indicates a structural mismatch.")
+        print(f" Model type: {type(model)}")
+        if structure_info:
+            print(f" Structure info: {structure_info.get('layer_path', 'unknown')}")
+        print(f" Please check the model architecture.")
+    else:
+        # Verify the conversion
+        print(f"\n🔍 Verifying conversion...")
+        verified_retention = 0
+
+        if hasattr(model, 'model') and hasattr(model.model, 'layers'):
+            check_layers = model.model.layers
+        else:
+            check_layers = []
+
+        for layer in check_layers:
+            if hasattr(layer, 'self_attn'):
+                if 'Retention' in layer.self_attn.__class__.__name__:
+                    verified_retention += 1
+
+        print(f" ✅ Verified: {verified_retention}/{len(check_layers)} layers have Retention")
+
+        if verified_retention == 0 and converted > 0:
+            print(f" ⚠️ WARNING: Conversion reported success but verification failed!")
 
     # 4. Evaluate
     print(f"\n📊 STEP 4: Evaluating model quality...")
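The verification pass above hardcodes model.model.layers, so for architectures the converter itself reaches via transformer.h or model.decoder.layers it would report 0 and trigger the false-failure warning. A path-agnostic count, offered only as a sketch and not as code from app.py (count_retention_attn is an illustrative name), could scan named modules instead.

import torch.nn as nn

def count_retention_attn(model: nn.Module) -> int:
    """Count self_attn submodules whose class name contains 'Retention', regardless of layer path."""
    return sum(
        1
        for name, module in model.named_modules()
        if name.endswith("self_attn") and "Retention" in module.__class__.__name__
    )

Substituting verified_retention = count_retention_attn(model) would keep the verification consistent with all four layouts the converter supports.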
@@ -1725,8 +1782,8 @@ def burn_model_with_finetuning(
         tokenizer.pad_token = tokenizer.eos_token
 
     print(f"\n🔄 STEP 3: Converting...")
+    model, converted, total = replace_attention_with_retention(
+        model,
         use_hierarchical=use_hierarchical,
         structure_info=structure_info
     )