Spaces:

Heartsync
/

phoenix

Paused

File size: 54,440 Bytes

28f2970
c381ead
 
c43a720
c381ead
 
 
 
 
 
 
 
28f2970
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef87883
 
c43a720
 
ef87883
c43a720
ef87883
 
 
18f492f
ef87883
35b8c28
 
28f2970
 
c381ead
28f2970
 
c381ead
28f2970
c381ead
 
23f9fc2
 
28f2970
ef87883
7916437
28f2970
35b8c28
 
 
23f9fc2
 
 
 
 
 
 
 
 
 
 
 
 
28f2970
c381ead
28f2970
e6bcdb0
c381ead
 
 
 
35b8c28
 
28f2970
cb3c4bf
b1fafe7
cb3c4bf
 
 
c43a720
cb3c4bf
 
 
 
 
 
 
 
 
 
c381ead
 
cb3c4bf
 
 
 
c381ead
cb3c4bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c381ead
cb3c4bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c381ead
 
 
cb3c4bf
c381ead
 
 
 
 
b1fafe7
c381ead
 
 
 
 
 
 
 
 
 
 
cb3c4bf
c43a720
cb3c4bf
 
 
 
 
 
 
 
c381ead
cb3c4bf
 
 
 
 
 
 
28f2970
c381ead
18f492f
 
 
ef87883
18f492f
 
 
 
 
068c039
18f492f
 
b1fafe7
 
 
 
 
d655ec6
0772ae3
 
 
 
18f492f
0772ae3
7916437
b1fafe7
 
0772ae3
 
b1fafe7
ef87883
 
b1fafe7
18f492f
ef87883
18f492f
 
 
 
b1fafe7
18f492f
 
0772ae3
c381ead
0772ae3
 
 
 
 
 
 
 
 
18f492f
 
 
 
 
 
 
 
d655ec6
 
 
18f492f
c381ead
0772ae3
18f492f
de99383
 
 
 
c43a720
de99383
ef87883
 
 
83d9107
068c039
0772ae3
068c039
18f492f
068c039
0772ae3
068c039
2c0487e
068c039
0772ae3
068c039
18f492f
0772ae3
 
 
c43a720
 
18f492f
 
 
 
7916437
18f492f
 
c43a720
 
41f8d59
0772ae3
 
 
18f492f
008f8f3
 
0772ae3
18f492f
c9f844d
18f492f
 
 
ef87883
 
 
18f492f
ef87883
0772ae3
18f492f
c43a720
 
 
 
 
18f492f
 
 
57a0735
 
 
 
b5c73ce
18f492f
ef87883
 
 
18f492f
 
008f8f3
 
 
 
18f492f
 
 
 
ef87883
2c0487e
c43a720
18f492f
0772ae3
18f492f
c381ead
18f492f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
068c039
 
 
18f492f
0772ae3
18f492f
 
41f8d59
8c55f6d
 
c43a720
 
41f8d59
98665cb
0772ae3
 
18f492f
 
98665cb
e6ac1c1
238a77b
 
 
18f492f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c9f844d
18f492f
 
 
b1fafe7
18f492f
 
cb3c4bf
c381ead
c43a720
18f492f
 
 
 
8863ba4
 
cb3c4bf
 
 
 
8863ba4
 
cb3c4bf
8863ba4
 
 
 
 
 
 
 
 
 
 
 
 
 
cb3c4bf
 
c43a720
18f492f
 
 
c43a720
b1fafe7
 
 
068c039
18f492f
 
 
 
2c0487e
18f492f
068c039
18f492f
068c039
18f492f
 
83d9107
c43a720
833e280
c43a720
 
 
 
 
 
18f492f
 
 
 
 
 
 
0772ae3
18f492f
 
 
 
d7d1b8f
c381ead
c43a720
 
 
 
 
 
 
 
23f9fc2
c381ead
c43a720
 
c381ead
c43a720
23f9fc2
 
 
 
c43a720
c381ead
c43a720
c381ead
c43a720
c381ead
 
c43a720
c381ead
c43a720
 
 
c381ead
 
 
 
 
 
 
c43a720
 
 
c381ead
c43a720
 
c381ead
c43a720
 
c381ead
 
 
 
 
 
c43a720
c381ead
c43a720
 
c381ead
 
 
 
 
 
c43a720
c381ead
c43a720
c381ead
 
 
 
 
 
 
 
 
 
 
c43a720
 
c381ead
 
 
c43a720
 
 
 
 
 
 
 
c381ead
c43a720
 
 
 
 
 
 
 
 
c381ead
 
 
c43a720
 
 
 
 
 
c381ead
c43a720
 
 
 
 
 
 
 
 
 
 
c381ead
c43a720
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c381ead
 
c43a720
 
 
 
 
 
 
 
 
 
 
c381ead
 
c43a720
c381ead
c43a720
 
 
 
 
 
 
 
 
 
 
 
 
 
c381ead
c43a720
c381ead
 
 
c43a720
c381ead
 
 
c43a720
 
 
 
 
c381ead
 
c43a720
 
 
 
c381ead
d7d1b8f
 
 
c43a720
d7d1b8f
7e1dc71
c43a720
 
 
 
d7d1b8f
 
 
 
7e1dc71
d7d1b8f
ad0a765
 
8fbd9ae
ad0a765
 
 
 
c43a720
7e1dc71
ad0a765
 
 
8fbd9ae
7e1dc71
ad0a765
d7d1b8f
7e1dc71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c43a720
7e1dc71
 
 
1fa5f7c
7e1dc71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7916437
 
7e1dc71
 
 
 
 
 
 
 
 
 
 
 
cce66a2
c43a720
3198863
 
1fa5f7c
c43a720
7e1dc71
 
3198863
7e1dc71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c43a720
7e1dc71
d7d1b8f
1fa5f7c
 
 
 
 
7e1dc71
 
cce66a2
c43a720
 
 
7e1dc71
 
 
 
 
 
 
7916437
 
 
 
 
7e1dc71
 
 
7916437
7e1dc71
c43a720
7e1dc71
 
 
d7d1b8f
7916437
7e1dc71
7916437
7e1dc71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7916437
7e1dc71
 
7916437
7e1dc71
 
 
 
 
 
 
 
d7d1b8f
7e1dc71
 
 
 
 
 
 
 
 
 
 
 
c43a720
7e1dc71
7916437
7e1dc71
 
 
7916437
7e1dc71
 
 
7916437
 
 
 
 
 
 
c381ead
7916437
 
 
c381ead
7916437
 
 
c43a720
7916437
 
c43a720
 
7916437
 
 
 
 
c43a720
7916437
 
 
 
c43a720
7916437
d7d1b8f
7916437
c43a720
7916437
 
 
 
d7d1b8f
c43a720
d7d1b8f
 
c43a720
d7d1b8f
 
c43a720
d7d1b8f
c381ead
 
d7d1b8f
c43a720
 
 
d7d1b8f
 
 
 
 
 
 
7916437
ad0a765
 
d7d1b8f
 
 
c381ead
d7d1b8f
 
c43a720
 
d7d1b8f
c381ead
ad0a765
 
35b8c28
 
 
 
 
ad0a765
35b8c28
c381ead
12f6cb3
35b8c28
 
 
 
c43a720
7916437
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c43a720
7916437
 
c43a720
951a5d5
 
c43a720
c381ead
c43a720
 
 
 
 
12f6cb3
15378c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c43a720
15378c4
 
 
 
 
c43a720
15378c4
 
 
 
 
7916437
c381ead
7916437
 
c43a720
15378c4
 
 
c43a720
 
 
 
c381ead
15378c4
c381ead
15378c4
c381ead
12f6cb3
 
15378c4
 
 
12f6cb3
c43a720
 
cb3c4bf
 
c381ead
 
15378c4
12f6cb3
15378c4
c381ead
 
15378c4
 
 
 
c381ead
 
12f6cb3
15378c4
 
 
12f6cb3
15378c4
c381ead
12f6cb3
c43a720
cb3c4bf
15378c4
12f6cb3
8863ba4
 
cb3c4bf
 
15378c4
12f6cb3
15378c4
 
12f6cb3
c43a720
ef87883
c381ead
c43a720
c381ead
c43a720
 
 
 
 
 
 
 
c381ead
c43a720
 
 
 
 
 
c381ead
ef87883
c43a720
 
 
 
ef87883
c43a720
 
ef87883
 
c43a720
ef87883
 
 
 
c43a720
 
c381ead
 
ef87883
 
 
d7d1b8f
ef87883
 
 
 
 
 
 
 
 
c43a720
c381ead
cb3c4bf
ef87883
 
 
c381ead
 
c43a720
ef87883
 
 
 
 
 
 
 
 
 
c43a720
ef87883
28f2970
 
7916437
c381ead
7916437
 
 
 
 
 
cc66f4c
7916437
 
 
 
 
 
c43a720
 
7916437
 
c43a720
 
c381ead
7916437
cc66f4c
7916437
 
 
c43a720
7916437
 
 
c43a720
c381ead
 
7916437
c43a720
 
 
 
 
 
c381ead
7916437
 
 
 
c43a720
7916437
 
 
 
 
cc66f4c
 
c43a720
 
 
cc66f4c
c43a720
cc66f4c
 
 
 
 
 
c43a720
 
 
 
c381ead
c43a720
 
cc66f4c
 
c381ead
cc66f4c
 
 
c43a720
cc66f4c
 
 
 
 
 
c381ead
c43a720
 
c381ead
 
cc66f4c
c43a720
 
7916437
 
 
c43a720
 
 
 
c381ead
7916437
cc66f4c
 
c43a720
cc66f4c
c43a720
cc66f4c
c43a720
 
 
 
 
 
 
cc66f4c
c43a720
 
cc66f4c
 
 
c43a720
 
 
c381ead
c43a720
cc66f4c
c43a720
cc66f4c
c381ead
 
 
 
cc66f4c
c43a720
 
 
 
 
 
cc66f4c
 
 
 
c381ead
 
c43a720
cc66f4c
 
 
 
c381ead
 
cc66f4c
 
 
 
c43a720
cc66f4c
 
c43a720
 
 
cc66f4c
c381ead
cc66f4c
 
 
 
 
c43a720
cc66f4c
 
c43a720
c381ead
cc66f4c
c43a720
cc66f4c
c43a720
cc66f4c
 
 
 
 
 
c43a720
 
c381ead
 
cc66f4c
 
c43a720
cc66f4c
 
 
 
28f2970
c43a720
28f2970
 
c381ead
bd3bb90
c381ead
 
bd3bb90
c381ead
bd3bb90
c381ead
 
 
c43a720
bd3bb90
 
 
 
 
 
 
 
c43a720
bd3bb90
 
 
 
c43a720
 
 
 
c381ead
c43a720
 
 
 
c381ead
c43a720
bd3bb90
c43a720
 
 
c381ead
c43a720
bd3bb90
 
c381ead
 
 
 
 
 
c43a720
bd3bb90
c381ead
 
 
 
 
 
c43a720
 
 
 
 
c381ead
c43a720
bd3bb90
c43a720
 
 
c381ead
c43a720
bd3bb90
c43a720
 
bd3bb90
c43a720
 
 
bd3bb90
 
 
 
 
 
 
 
 
 
c43a720
c381ead
c43a720
bd3bb90
 
 
 
c43a720
bd3bb90
 
c43a720
bd3bb90
c43a720
bd3bb90
 
c43a720
bd3bb90
 
 
 
c381ead
 
 
76e2b69
c381ead
 
 
 
 
 
bd3bb90
c43a720
c381ead
bd3bb90
28f2970
c43a720
28f2970
23f9fc2
 
 
 
 
 
 
 
bd3bb90
23f9fc2

"""
🔥 PHOENIX Retention Research Platform v2.0 - MULTI-GPU OPTIMIZED
H100 x 8 GPU 최적화 버전

✅ v2.0 NEW: Multi-GPU (8x H100) 최적화
✅ v2.0 NEW: Accelerate 통합
✅ v2.0 NEW: DeepSpeed ZeRO-3 지원
✅ v2.0 NEW: Gradient Checkpointing
✅ Fine-tuning 파이프라인 (Brumby-style)
✅ 모든 v1.4.3 수정사항 포함

VIDraft AI Research Lab - Multi-GPU Version v2.0
"""

import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
import sqlite3
import json
import time
import numpy as np
from datetime import datetime
from pathlib import Path
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
from typing import Dict, List, Any, Tuple, Optional
from transformers import (
    AutoModel, AutoTokenizer, AutoConfig, AutoModelForCausalLM,
    get_cosine_schedule_with_warmup, TrainingArguments, Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset, concatenate_datasets
from torch.utils.data import Dataset, DataLoader
from accelerate import Accelerator
from tqdm import tqdm
import copy
import shutil
import os
from huggingface_hub import HfApi, create_repo

# =====================================================
# Global configuration - MULTI-GPU
# =====================================================

# GPU setup: DEVICE is the default device string, NUM_GPUS the number of
# CUDA devices visible to this process (0 on a CPU-only host).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_GPUS = torch.cuda.device_count()

# ✅ Ubuntu-friendly: storage root comes from the PHOENIX_STORAGE_PATH
# environment variable, falling back to ~/phoenix_data.
STORAGE_PATH = os.getenv("PHOENIX_STORAGE_PATH", str(Path.home() / "phoenix_data"))
DB_PATH = f"{STORAGE_PATH}/phoenix_experiments.db"
MODELS_PATH = f"{STORAGE_PATH}/phoenix_models"
DEFAULT_MODEL = "Qwen/Qwen3-0.6B"

# HuggingFace Token (None when the HF_TOKEN environment variable is unset)
HF_TOKEN = os.getenv("HF_TOKEN")

# Create the storage directories (with permission-error handling)
try:
    Path(STORAGE_PATH).mkdir(parents=True, exist_ok=True)
    Path(MODELS_PATH).mkdir(parents=True, exist_ok=True)
    print(f"✅ Storage initialized: {STORAGE_PATH}")
except PermissionError:
    # The configured location is not writable: rebind all three path
    # globals to a directory under the current working directory and
    # create that instead. Only PermissionError is handled here; any
    # other OSError propagates.
    print(f"⚠️ Permission denied for {STORAGE_PATH}")
    print(f"   Using current directory instead")
    STORAGE_PATH = "./phoenix_data"
    DB_PATH = f"{STORAGE_PATH}/phoenix_experiments.db"
    MODELS_PATH = f"{STORAGE_PATH}/phoenix_models"
    Path(STORAGE_PATH).mkdir(parents=True, exist_ok=True)
    Path(MODELS_PATH).mkdir(parents=True, exist_ok=True)

# Startup banner describing the effective configuration.
print(f"🔥 PHOENIX Platform v2.0 - Multi-GPU Optimized")
print(f"💾 Storage: {STORAGE_PATH}")
print(f"🎯 Default Base Model: {DEFAULT_MODEL}")
print(f"🚀 GPUs Available: {NUM_GPUS}")
if NUM_GPUS > 0:
    for i in range(NUM_GPUS):
        print(f"   GPU {i}: {torch.cuda.get_device_name(i)}")
if HF_TOKEN:
    # Only the last four characters of the token are echoed.
    print(f"🔑 HuggingFace Token: {'*' * 10}{HF_TOKEN[-4:]}")

# =====================================================
# Model structure analysis
# =====================================================

def analyze_model_structure(model_url: str) -> Dict[str, Any]:
    """🔍 Load a causal-LM checkpoint on CPU and summarise its layer layout.

    The config and the full model are loaded (float16, CPU only) so that the
    decoder-layer container and the first layer's attention projections can
    be inspected.

    Args:
        model_url: Model identifier or path handed to ``from_pretrained``.

    Returns:
        On success, a dict with ``model_url``, ``model_type``,
        ``architectures`` (first entry, or ``'unknown'``), ``hidden_size``,
        ``num_attention_heads``, ``num_hidden_layers``,
        ``num_key_value_heads``, ``total_layers``, ``has_self_attn`` and
        ``layer_path``. When the first layer's ``self_attn`` exposes both
        ``q_proj`` and ``k_proj`` the dict additionally carries
        ``gqa_detected``, ``q_dim``, ``k_dim`` and (if the head count is
        positive) ``head_dim``.
        On any failure, ``{'model_url': ..., 'error': str(exc),
        'total_layers': 0}``; the exception is logged, never raised.
    """
    print("\n" + "="*80)
    print("🔍 MODEL STRUCTURE ANALYSIS")
    print("="*80)

    model = None
    try:
        print(f"\n📥 Loading model config: {model_url}")
        # NOTE(security): trust_remote_code=True executes Python shipped with
        # the checkpoint repository; only use with sources you trust.
        config = AutoConfig.from_pretrained(model_url, trust_remote_code=True)

        print(f"✅ Config loaded")

        # ✅ Multi-GPU: load on CPU only (the model is used for inspection)
        print(f"\n📦 Loading model structure (CPU only)...")
        model = AutoModelForCausalLM.from_pretrained(
            model_url,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="cpu"  # analysis runs on CPU
        )

        # `architectures` may exist on the config but be None or empty;
        # indexing it blindly would abort the whole analysis.
        architectures = getattr(config, 'architectures', None)
        num_heads = getattr(config, 'num_attention_heads', 0)

        analysis = {
            'model_url': model_url,
            'model_type': getattr(config, 'model_type', 'unknown'),
            'architectures': architectures[0] if architectures else 'unknown',
            'hidden_size': getattr(config, 'hidden_size', 0),
            'num_attention_heads': num_heads,
            'num_hidden_layers': getattr(config, 'num_hidden_layers', 0),
            'num_key_value_heads': getattr(config, 'num_key_value_heads', None),
            'total_layers': 0,
            'has_self_attn': False,
            'layer_path': None,
        }

        # Locate the decoder-layer container; first matching path wins.
        layers = None
        layer_path = None
        known_paths = (
            ('model.layers', 'model', 'layers'),
            ('transformer.h', 'transformer', 'h'),
        )
        for path_name, parent_attr, child_attr in known_paths:
            parent = getattr(model, parent_attr, None)
            candidate = getattr(parent, child_attr, None) if parent is not None else None
            if candidate is not None:
                layers = candidate
                layer_path = path_name
                break

        if layers is not None:
            analysis['total_layers'] = len(layers)
            analysis['layer_path'] = layer_path

            if len(layers) > 0:
                first_layer = layers[0]
                if hasattr(first_layer, 'self_attn'):
                    analysis['has_self_attn'] = True
                    attn = first_layer.self_attn

                    # Both projections are needed for the GQA comparison.
                    if hasattr(attn, 'q_proj') and hasattr(attn, 'k_proj'):
                        q_shape = attn.q_proj.weight.shape
                        k_shape = attn.k_proj.weight.shape

                        if num_heads and num_heads > 0:
                            # Derived from the real projection width rather
                            # than hidden_size // num_heads.
                            analysis['head_dim'] = q_shape[0] // num_heads

                        # A narrower K projection means K/V heads are shared.
                        analysis['gqa_detected'] = (k_shape[0] != q_shape[0])
                        analysis['q_dim'] = q_shape[0]
                        analysis['k_dim'] = k_shape[0]

        print(f"\n{'='*80}\n")

        return analysis

    except Exception as e:
        import traceback
        print(f"\n❌ Structure analysis failed: {e}")
        traceback.print_exc()
        return {
            'model_url': model_url,
            'error': str(e),
            'total_layers': 0,
        }
    finally:
        # Drop the reference on both the success and the failure path.
        model = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


# =====================================================
# PHOENIX Retention (same as before)
# =====================================================

class MultiScaleRetention(nn.Module):
    """Retention layer with GQA support, shaped like a self-attention module.

    Exposes ``q_proj``/``k_proj``/``v_proj``/``o_proj`` (all bias-free), a
    learnable per-head ``decay`` parameter and a ``GroupNorm`` with one group
    per query head. ``forward`` returns ``(output, None)``.
    """

    def __init__(self, config, layer_idx=0):
        """Build the projections from a model config.

        Args:
            config: Object providing ``hidden_size`` and
                ``num_attention_heads``; ``head_dim`` and
                ``num_key_value_heads`` are optional and may also be present
                but ``None``.
            layer_idx: Index of the decoder layer this module belongs to.

        Raises:
            ValueError: If ``num_attention_heads`` is not a multiple of
                ``num_key_value_heads``.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        # An attribute that exists but is None must fall back to the default
        # as well; a plain hasattr() check would let None through and crash
        # in the dimension arithmetic below.
        self.head_dim = getattr(config, 'head_dim', None) or (
            self.hidden_size // self.num_heads
        )
        self.num_key_value_heads = (
            getattr(config, 'num_key_value_heads', None) or self.num_heads
        )

        if self.num_heads % self.num_key_value_heads != 0:
            raise ValueError(
                f"num_attention_heads ({self.num_heads}) must be divisible by "
                f"num_key_value_heads ({self.num_key_value_heads})"
            )

        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_dim = self.head_dim

        self.q_dim = self.num_heads * self.head_dim
        self.kv_dim = self.num_key_value_heads * self.kv_head_dim

        self.q_proj = nn.Linear(self.hidden_size, self.q_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False)
        self.o_proj = nn.Linear(self.q_dim, self.hidden_size, bias=False)

        # NOTE(review): _compute_retention passes this parameter through a
        # sigmoid, so the initial effective decay is sigmoid(0.95..0.99),
        # roughly 0.72-0.73, not 0.95-0.99. Confirm that is intended.
        decay_values = torch.linspace(0.95, 0.99, self.num_heads)
        self.decay = nn.Parameter(decay_values, requires_grad=True)

        self.group_norm = nn.GroupNorm(
            num_groups=self.num_heads,
            num_channels=self.q_dim
        )

    def _repeat_kv(self, hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
        """Repeat each K/V head ``n_rep`` times along the head axis (GQA).

        Args:
            hidden_states: Tensor of shape ``(batch, kv_heads, seq, head_dim)``.
            n_rep: Repetition factor; ``1`` returns the input unchanged.

        Returns:
            Tensor of shape ``(batch, kv_heads * n_rep, seq, head_dim)`` in
            which every source head appears ``n_rep`` times consecutively.
        """
        batch, num_key_value_heads, slen, head_dim = hidden_states.shape
        if n_rep == 1:
            return hidden_states

        hidden_states = hidden_states[:, :, None, :, :].expand(
            batch, num_key_value_heads, n_rep, slen, head_dim
        )
        return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.Tensor]] = None,
        **kwargs
    ):
        """Apply retention to ``hidden_states``.

        Args:
            hidden_states: Tensor of shape ``(batch, seq, hidden_size)``.
            attention_mask, position_ids, past_key_value, output_attentions,
            use_cache, cache_position, past_key_values, **kwargs: Accepted so
                the module can be called like an attention layer; none of
                them is read by this implementation.

        Returns:
            ``(output, None)`` where ``output`` has shape
            ``(batch, seq, hidden_size)``.
        """
        batch_size, seq_len, _ = hidden_states.shape

        target_device = hidden_states.device
        target_dtype = hidden_states.dtype

        # Follow the device/dtype of the incoming activations; the module may
        # have been created after the host model was placed or cast.
        if self.q_proj.weight.device != target_device or self.q_proj.weight.dtype != target_dtype:
            self.to(device=target_device, dtype=target_dtype)

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (batch, seq, heads * dim) -> (batch, heads, seq, dim)
        query_states = query_states.view(
            batch_size, seq_len, self.num_heads, self.head_dim
        ).transpose(1, 2)

        key_states = key_states.view(
            batch_size, seq_len, self.num_key_value_heads, self.kv_head_dim
        ).transpose(1, 2)

        value_states = value_states.view(
            batch_size, seq_len, self.num_key_value_heads, self.kv_head_dim
        ).transpose(1, 2)

        # Expand shared K/V heads so every query head has a partner.
        key_states = self._repeat_kv(key_states, self.num_key_value_groups)
        value_states = self._repeat_kv(value_states, self.num_key_value_groups)

        retention_states = self._compute_retention(
            query_states, key_states, value_states
        )

        retention_states = retention_states.transpose(1, 2).contiguous()
        retention_states = retention_states.reshape(
            batch_size, seq_len, self.q_dim
        )

        if self.group_norm.weight.device != retention_states.device or self.group_norm.weight.dtype != retention_states.dtype:
            self.group_norm = self.group_norm.to(device=retention_states.device, dtype=retention_states.dtype)

        # GroupNorm expects channels on dim 1, hence the transposes.
        # NOTE(review): with input laid out as (batch, q_dim, seq) the
        # statistics of each group also span the sequence axis - confirm
        # this is acceptable for causal use.
        retention_states = self.group_norm(
            retention_states.transpose(1, 2)
        ).transpose(1, 2)

        # Bound activations before the output projection.
        retention_states = torch.clamp(retention_states, min=-10.0, max=10.0)

        attn_output = self.o_proj(retention_states)

        return (attn_output, None)

    def _compute_retention(
        self,
        queries: torch.Tensor,
        keys: torch.Tensor,
        values: torch.Tensor,
    ):
        """Run the recurrent retention update, one step per sequence position.

        A ``(head_dim, head_dim)`` state per batch element and head is decayed,
        updated with the clamped outer product of the current key and value,
        clamped again and then read out with the current query. The amount of
        work per position is constant, so cost is linear in sequence length.

        Args:
            queries, keys, values: Tensors of shape
                ``(batch, heads, seq, head_dim)`` with matching head counts.

        Returns:
            Tensor of shape ``(batch, heads, seq, head_dim)``.
        """
        batch_size, num_heads, seq_len, head_dim = queries.shape

        # State starts at a small constant (1e-6) rather than exact zero.
        state = torch.zeros(
            batch_size, num_heads, head_dim, head_dim,
            dtype=queries.dtype,
            device=queries.device
        ) + 1e-6

        outputs = []

        # Per-head decay, squashed into (0, 1) and broadcast over the state.
        decay = torch.sigmoid(self.decay).view(1, -1, 1, 1).to(
            device=queries.device,
            dtype=queries.dtype
        )

        for t in range(seq_len):
            q_t = queries[:, :, t, :]
            k_t = keys[:, :, t, :]
            v_t = values[:, :, t, :]

            state = decay * state
            kv_update = torch.einsum('bhd,bhe->bhde', k_t, v_t)
            # Clamp both the update and the accumulated state to fixed ranges.
            kv_update = torch.clamp(kv_update, min=-5.0, max=5.0)
            state = state + kv_update
            state = torch.clamp(state, min=-10.0, max=10.0)

            output_t = torch.einsum('bhd,bhde->bhe', q_t, state)
            outputs.append(output_t)

        output = torch.stack(outputs, dim=2)

        return output


class HierarchicalRetention(nn.Module):
    """PHOENIX hierarchical retention.

    Wraps a ``MultiScaleRetention`` and feeds its output through three
    recurrent states updated at different cadences: a short state every
    step, a medium state every 8th step and a long state every 64th step.
    The concatenated states are fused back to ``hidden_size`` and
    layer-normalised.
    """

    def __init__(self, config, layer_idx=0):
        """Create the base retention and the hierarchical projections.

        Args:
            config: Model config; ``hidden_size`` is read here and the whole
                object is forwarded to ``MultiScaleRetention``.
            layer_idx: Index of the decoder layer this module belongs to.
        """
        super().__init__()
        self.base_retention = MultiScaleRetention(config, layer_idx)

        hidden_size = config.hidden_size
        self.d_state = hidden_size // 2

        self.short_proj = nn.Linear(hidden_size, self.d_state)
        self.medium_proj = nn.Linear(self.d_state, self.d_state)
        self.long_proj = nn.Linear(self.d_state, self.d_state * 2)
        # short (d) + medium (d) + long (2d) = 4d fused back to hidden_size
        self.fusion = nn.Linear(self.d_state * 4, hidden_size)

        # Fixed (non-learned) decay factors of the three states.
        self.short_decay = 0.5
        self.medium_decay = 0.8
        self.long_decay = 0.95

        self.norm = nn.LayerNorm(hidden_size)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.Tensor]] = None,
        **kwargs
    ):
        """Hierarchical forward pass.

        Args:
            hidden_states: Tensor of shape ``(batch, seq, hidden_size)``.
            attention_mask, position_ids, past_key_value, output_attentions,
            use_cache: Forwarded positionally to the base retention.
            cache_position, past_key_values, **kwargs: Accepted for call
                compatibility; not used here.

        Returns:
            ``(output, None)`` where ``output`` has shape
            ``(batch, seq, hidden_size)``.
        """
        batch_size, seq_len, _ = hidden_states.shape

        target_device = hidden_states.device
        target_dtype = hidden_states.dtype

        # Follow the device/dtype of the incoming activations.
        if self.short_proj.weight.device != target_device or self.short_proj.weight.dtype != target_dtype:
            self.to(device=target_device, dtype=target_dtype)

        base_result = self.base_retention(
            hidden_states, attention_mask, position_ids,
            past_key_value, output_attentions, use_cache
        )

        retention_output = base_result[0]

        # The short-state input depends only on the retention output at each
        # position, never on the recurrent state, so project the whole
        # sequence in one call instead of once per timestep.
        short_inputs = self.short_proj(retention_output)

        short_state = torch.zeros(batch_size, self.d_state, dtype=target_dtype, device=target_device)
        medium_state = torch.zeros(batch_size, self.d_state, dtype=target_dtype, device=target_device)
        long_state = torch.zeros(batch_size, self.d_state * 2, dtype=target_dtype, device=target_device)

        combined_steps = []

        for t in range(seq_len):
            short_state = self.short_decay * short_state + short_inputs[:, t, :]

            # Medium state refreshes every 8th position (including t == 0).
            if t % 8 == 0:
                medium_state = self.medium_decay * medium_state + \
                              self.medium_proj(short_state)

            # Long state refreshes every 64th position (including t == 0).
            if t % 64 == 0:
                long_state = self.long_decay * long_state + \
                            self.long_proj(medium_state)

            combined_steps.append(
                torch.cat([short_state, medium_state, long_state], dim=-1)
            )

        # Fusion is position-wise as well: apply it once to the stacked states.
        combined = torch.stack(combined_steps, dim=1)
        output = self.norm(self.fusion(combined))

        return (output, None)


# =====================================================
# Model conversion
# =====================================================

def replace_attention_with_retention(model, use_hierarchical=True, structure_info=None):
    """Transformer Attention → PHOENIX Retention, applied in place.

    Every decoder layer that has a ``self_attn`` attribute gets it replaced by
    a ``HierarchicalRetention`` (default) or ``MultiScaleRetention``. When the
    old attention exposes ``q_proj``, the q/k/v/o projection weights are
    copied into the new module wherever the shapes match.

    Args:
        model: Model whose layers live at ``model.layers`` or
            ``transformer.h``.
        use_hierarchical: Choose ``HierarchicalRetention`` over
            ``MultiScaleRetention``.
        structure_info: Optional dict; ``'layer_path'`` names the container
            to try first and a truthy ``'head_dim'`` is written to
            ``model.config.head_dim`` before the new modules are built.

    Returns:
        ``(model, replaced_count, total_layers)``; ``(model, 0, 0)`` when no
        layer container is found.
    """
    print("🔄 Starting Attention → Retention conversion...")

    replaced_count = 0

    # (path name, parent attribute, child attribute) in fallback order.
    known_paths = [
        ('model.layers', 'model', 'layers'),
        ('transformer.h', 'transformer', 'h'),
    ]
    preferred = structure_info.get('layer_path') if structure_info else None
    # Stable sort: the path named by structure_info (if any) is tried first,
    # the remaining ones keep their fallback order.
    ordered_paths = sorted(known_paths, key=lambda entry: entry[0] != preferred)

    layers = None
    for _, parent_attr, child_attr in ordered_paths:
        parent = getattr(model, parent_attr, None)
        candidate = getattr(parent, child_attr, None) if parent is not None else None
        if candidate is not None:
            layers = candidate
            break

    if layers is None:
        print("❌ Cannot find layers")
        return model, 0, 0

    total_layers = len(layers)
    print(f"   Found {total_layers} layers")

    if structure_info and structure_info.get('head_dim'):
        model.config.head_dim = structure_info['head_dim']

    skipped_copies = 0

    for layer_idx, layer in enumerate(layers):
        if not hasattr(layer, 'self_attn'):
            continue

        try:
            old_attn = layer.self_attn

            if use_hierarchical:
                new_retention = HierarchicalRetention(model.config, layer_idx)
                target = new_retention.base_retention
            else:
                new_retention = MultiScaleRetention(model.config, layer_idx)
                target = new_retention

            if hasattr(old_attn, 'q_proj'):
                # Best-effort weight transplant. A projection that is missing
                # or has a different shape is reported and left at its fresh
                # initialisation instead of being silently ignored or
                # overwritten with a tensor of the wrong shape.
                # NOTE(review): only `.weight` is copied; if the source
                # projections carry biases they are not transferred.
                for proj_name in ('q_proj', 'k_proj', 'v_proj', 'o_proj'):
                    source = getattr(old_attn, proj_name, None)
                    destination = getattr(target, proj_name)
                    if source is None or source.weight.shape != destination.weight.shape:
                        skipped_copies += 1
                        print(f"   ⚠️ Layer {layer_idx}: {proj_name} weights not copied "
                              f"(missing or shape mismatch)")
                        continue
                    destination.weight.data = source.weight.data.clone()

            layer.self_attn = new_retention
            replaced_count += 1

        except Exception as e:
            # Keep converting the remaining layers, but say what went wrong;
            # this layer keeps its original attention module.
            print(f"   ⚠️ Layer {layer_idx}: conversion failed "
                  f"({type(e).__name__}: {e}); original attention kept")
            continue

    print(f"\n✅ Conversion complete: {replaced_count}/{total_layers} layers")
    if skipped_copies:
        print(f"   ⚠️ {skipped_copies} projection weight copies were skipped")

    return model, replaced_count, total_layers


# =====================================================
# 🆕 MULTI-GPU fine-tuning pipeline
# =====================================================

def finetune_retention_model(
    model,
    tokenizer,
    num_steps: int = 3000,
    batch_size: int = 4,
    learning_rate: float = 1e-5,
    output_dir: str = None,
    use_gradient_checkpointing: bool = True,
):
    """
    🆕 v2.0: Brumby-style retraining with multi-GPU support.

    Builds a dataset with ``prepare_simple_dataset``, configures a
    ``Trainer`` and runs ``trainer.train()`` for ``num_steps`` steps.

    Args:
        model: Model to fine-tune (passed to ``Trainer``).
        tokenizer: Tokenizer used for dataset preparation and collation.
        num_steps: Value for ``max_steps``.
        batch_size: Per-device train batch size.
        learning_rate: Optimizer learning rate.
        output_dir: Checkpoint directory; defaults to
            ``{STORAGE_PATH}/finetuning_temp``.
        use_gradient_checkpointing: Call ``model.gradient_checkpointing_enable()``
            when the model provides it.

    Returns:
        The same ``model`` object after training.
    """
    # Default output_dir
    if output_dir is None:
        output_dir = f"{STORAGE_PATH}/finetuning_temp"

    # On a host without CUDA devices NUM_GPUS is 0; treat that as a single
    # worker so the batch arithmetic below never multiplies or divides by 0.
    world_size = max(1, NUM_GPUS)
    use_cuda = NUM_GPUS > 0
    grad_accum_steps = max(1, 8 // world_size)  # scales with the GPU count

    print("\n" + "="*80)
    print("🔥 PHOENIX RETRAINING - Multi-GPU (v2.0)")
    print("="*80)
    print(f"   GPUs: {NUM_GPUS}")
    print(f"   Target Steps: {num_steps}")
    print(f"   Batch Size per GPU: {batch_size}")
    print(f"   Global Batch Size: {batch_size * world_size}")
    print(f"   Learning Rate: {learning_rate}")
    print(f"   Gradient Checkpointing: {use_gradient_checkpointing}")

    start_time = time.time()

    # ✅ Gradient checkpointing (saves memory)
    if use_gradient_checkpointing:
        if hasattr(model, 'gradient_checkpointing_enable'):
            model.gradient_checkpointing_enable()
            print(f"   ✅ Gradient Checkpointing enabled")

    # Dataset preparation
    train_dataset = prepare_simple_dataset(
        tokenizer=tokenizer,
        num_steps=num_steps,
        batch_size=batch_size * world_size  # account for all devices
    )

    # ✅ Multi-GPU training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,

        # 🚀 Multi-GPU settings
        per_device_train_batch_size=batch_size,  # batch per GPU
        gradient_accumulation_steps=grad_accum_steps,

        # Training settings
        num_train_epochs=1,
        max_steps=num_steps,
        learning_rate=learning_rate,
        warmup_steps=100,

        # Optimization: fp16 autocast and the fused AdamW kernel are only
        # requested when CUDA devices are present.
        fp16=use_cuda,
        optim="adamw_torch_fused" if use_cuda else "adamw_torch",

        # Logging
        logging_steps=50,
        logging_first_step=True,
        save_steps=1000,
        save_total_limit=2,

        # Performance
        dataloader_num_workers=4 * NUM_GPUS,  # 4 workers per GPU
        dataloader_pin_memory=use_cuda,

        # Multi-GPU related
        ddp_find_unused_parameters=False,
        ddp_backend="nccl" if use_cuda else None,

        # Misc
        remove_unused_columns=False,
        report_to="none",

        # ✅ DeepSpeed (optional)
        # deepspeed="ds_config.json",  # when using DeepSpeed
    )

    # Data collator (causal LM: mlm=False)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # ✅ Trainer (handles multi-GPU automatically)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Train!
    print(f"\n🚀 Starting Multi-GPU Fine-tuning...")
    print(f"   Using {NUM_GPUS} GPUs")

    trainer.train()

    elapsed = time.time() - start_time

    # Each optimizer step consumes batch_size * devices * accumulation samples.
    samples_seen = num_steps * batch_size * world_size * grad_accum_steps

    print(f"\n✅ Fine-tuning Complete!")
    print(f"   Time: {elapsed/60:.1f} minutes")
    print(f"   Effective samples/sec: {samples_seen / max(elapsed, 1e-9):.2f}")
    print(f"="*80 + "\n")

    return model


def prepare_simple_dataset(
    tokenizer,
    num_steps: int,
    batch_size: int,
    max_length: int = 2048,
):
    """Load and tokenize a slice of WikiText-2 (raw) for fine-tuning.

    Args:
        tokenizer: Callable tokenizer; invoked with ``truncation=True``,
            ``max_length`` and ``padding="max_length"``. If it has no pad
            token but has an EOS token, the EOS token is assigned as pad
            token (the tokenizer object is modified).
        num_steps: Number of training steps.
        batch_size: Samples per step; ``num_steps * batch_size`` rows are
            requested from the train split.
        max_length: Sequence length every example is truncated/padded to.

    Returns:
        The tokenized dataset with the original columns removed. It can hold
        fewer than ``num_steps * batch_size`` rows because blank rows are
        dropped.

    Raises:
        Exception: Whatever ``load_dataset`` raised, after logging it.
    """
    print(f"\n📊 Preparing Dataset...")

    num_samples = num_steps * batch_size

    print(f"   Target samples: {num_samples}")

    # padding="max_length" needs a pad token; fall back to EOS when the
    # tokenizer does not define one.
    if getattr(tokenizer, 'pad_token', None) is None and getattr(tokenizer, 'eos_token', None) is not None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"   ℹ️ pad_token was unset; using eos_token")

    try:
        dataset = load_dataset(
            "wikitext",
            "wikitext-2-raw-v1",
            split=f"train[:{num_samples}]"
        )
        print(f"   ✅ Loaded: {len(dataset)} samples")
    except Exception as e:
        print(f"   ❌ Failed: {e}")
        raise

    # The raw WikiText split contains blank separator rows. Padded to
    # max_length they would become sequences made of padding only, so they
    # are removed before tokenization.
    dataset = dataset.filter(lambda row: bool(row['text'].strip()))
    print(f"   ✅ Non-empty: {len(dataset)} samples")

    def tokenize_function(examples):
        # Batched call: examples['text'] is a list of strings.
        return tokenizer(
            examples['text'],
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )

    tokenized = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names,
        num_proc=4  # Parallel processing
    )

    print(f"   ✅ Tokenized: {len(tokenized)} samples")

    return tokenized


def estimate_finetuning_cost(
    model_size: str,
    num_steps: int,
    batch_size: int,
    num_gpus: Optional[int] = None,
    gpu_type: str = "H100",
    krw_per_usd: float = 1300.0,
) -> Dict:
    """Estimate wall-clock hours and cost of a multi-GPU fine-tuning run.

    Per-step time comes from a built-in table keyed by model size (measured
    at batch size 4), is scaled linearly with ``batch_size`` and divided by
    the GPU count (linear scaling assumed). Cost is hours × hourly GPU price
    × GPU count.

    Args:
        model_size: One of ``"0.6B"``, ``"1.5B"``, ``"3B"``, ``"7B"``,
            ``"14B"``; unknown sizes use 1.0 s per step.
        num_steps: Number of training steps.
        batch_size: Per-step batch size.
        num_gpus: Number of GPUs; ``None`` (default) uses the module-level
            ``NUM_GPUS``. Values below 1 are treated as 1 so the estimate
            never divides by zero on a CPU-only host.
        gpu_type: ``"H100"``, ``"A100"`` or ``"A10G"``; unknown types are
            priced at 2.0 USD per hour.
        krw_per_usd: Exchange rate used for the ``cost_krw`` field.

    Returns:
        Dict with ``hours`` (rounded to 2 decimals), ``cost_usd`` (2
        decimals), ``cost_krw`` (rounded to whole won), ``num_gpus`` (the
        effective count used) and ``gpu_type``.
    """
    # USD per GPU-hour.
    gpu_costs = {
        "H100": 3.0,
        "A100": 2.0,
        "A10G": 1.0,
    }

    # Seconds per step at batch size 4 on a single GPU.
    model_step_times = {
        "0.6B": 0.5,
        "1.5B": 1.0,
        "3B": 2.0,
        "7B": 3.5,
        "14B": 6.0,
    }

    # Resolve the default at call time, then clamp to at least one device.
    if num_gpus is None:
        num_gpus = NUM_GPUS
    gpu_count = max(1, num_gpus)

    # Time reduction from multiple GPUs (linear scaling assumed).
    step_time = model_step_times.get(model_size, 1.0) * (batch_size / 4)
    step_time_per_gpu = step_time / gpu_count

    total_hours = (num_steps * step_time_per_gpu) / 3600

    # Every GPU is billed for the whole duration.
    total_cost_usd = total_hours * gpu_costs.get(gpu_type, 2.0) * gpu_count

    return {
        'hours': round(total_hours, 2),
        'cost_usd': round(total_cost_usd, 2),
        'cost_krw': round(total_cost_usd * krw_per_usd, 0),
        'num_gpus': gpu_count,
        'gpu_type': gpu_type,
    }


# =====================================================
# Custom Modeling Code (same as before)
# =====================================================

def generate_modeling_phoenix_code() -> str:
    """Return the source text of the standalone PHOENIX loader module.

    The returned string is a complete Python module; elsewhere in this file
    ``save_phoenix_model_with_code`` writes it to ``modeling_phoenix.py`` next
    to the saved checkpoint. The template defines ``PhoenixConfig``, the
    ``MultiScaleRetention`` and ``HierarchicalRetention`` layers, a helper that
    replaces each layer's ``self_attn`` with a retention module, and
    ``PhoenixModelForCausalLM`` (whose ``from_pretrained`` rebuilds the base
    architecture, swaps in retention layers and loads the saved weights). It
    ends by registering the ``"phoenix"`` model type with ``AutoConfig``.

    NOTE(review): everything inside the triple-single-quoted literal below is
    runtime data (it is emitted verbatim), so any edit to it changes the file
    shipped with every saved model.

    Returns:
        str: the module source, beginning with its own module docstring.
    """
    
    return '''"""
PHOENIX Retention Model v2.0
✅ v2.0: Brumby-style Retraining support
✅ v1.4.3: forward() 시그니처 Transformers 호환
✅ v1.4.3: dtype 불일치 수정
"""

import torch
import torch.nn as nn
from typing import Optional, Tuple
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig
from transformers import AutoConfig, AutoModelForCausalLM
import os


class PhoenixConfig(PretrainedConfig):
    model_type = "phoenix"
    def __init__(self, use_phoenix_retention=True, phoenix_version="2.0", 
                 original_model=None, use_hierarchical=True, **kwargs):
        super().__init__(**kwargs)
        self.use_phoenix_retention = use_phoenix_retention
        self.phoenix_version = phoenix_version
        self.original_model = original_model
        self.use_hierarchical = use_hierarchical


class MultiScaleRetention(nn.Module):
    def __init__(self, config, layer_idx=0):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = getattr(config, 'head_dim', self.hidden_size // self.num_heads)
        self.num_key_value_heads = getattr(config, 'num_key_value_heads', self.num_heads)
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.q_dim = self.num_heads * self.head_dim
        self.kv_dim = self.num_key_value_heads * self.head_dim
        
        self.q_proj = nn.Linear(self.hidden_size, self.q_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False)
        self.o_proj = nn.Linear(self.q_dim, self.hidden_size, bias=False)
        self.decay = nn.Parameter(torch.linspace(0.95, 0.99, self.num_heads))
        self.group_norm = nn.GroupNorm(self.num_heads, self.q_dim)
    
    def _repeat_kv(self, x, n):
        b, h, s, d = x.shape
        if n == 1: return x
        return x[:, :, None, :, :].expand(b, h, n, s, d).reshape(b, h*n, s, d)
    
    def forward(self, hidden_states, **kwargs):
        b, s, _ = hidden_states.shape
        device, dtype = hidden_states.device, hidden_states.dtype
        
        if self.q_proj.weight.device != device or self.q_proj.weight.dtype != dtype:
            self.to(device=device, dtype=dtype)
        
        q = self.q_proj(hidden_states).view(b, s, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(hidden_states).view(b, s, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(hidden_states).view(b, s, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        
        k = self._repeat_kv(k, self.num_key_value_groups)
        v = self._repeat_kv(v, self.num_key_value_groups)
        
        out = self._retention(q, k, v)
        out = out.transpose(1, 2).reshape(b, s, self.q_dim)
        out = self.group_norm(out.transpose(1, 2)).transpose(1, 2)
        return (self.o_proj(torch.clamp(out, -10, 10)), None)
    
    def _retention(self, q, k, v):
        b, h, s, d = q.shape
        state = torch.zeros(b, h, d, d, dtype=q.dtype, device=q.device) + 1e-6
        decay = torch.sigmoid(self.decay).view(1, -1, 1, 1).to(q)
        outs = []
        for t in range(s):
            state = decay * state + torch.clamp(torch.einsum('bhd,bhe->bhde', k[:,:,t], v[:,:,t]), -5, 5)
            state = torch.clamp(state, -10, 10)
            outs.append(torch.einsum('bhd,bhde->bhe', q[:,:,t], state))
        return torch.stack(outs, dim=2)


class HierarchicalRetention(nn.Module):
    def __init__(self, config, layer_idx=0):
        super().__init__()
        self.base_retention = MultiScaleRetention(config, layer_idx)
        h = config.hidden_size
        self.d_state = h // 2
        self.short_proj = nn.Linear(h, self.d_state)
        self.medium_proj = nn.Linear(self.d_state, self.d_state)
        self.long_proj = nn.Linear(self.d_state, self.d_state*2)
        self.fusion = nn.Linear(self.d_state*4, h)
        self.norm = nn.LayerNorm(h)
        self.decays = [0.5, 0.8, 0.95]
    
    def forward(self, hidden_states, **kwargs):
        b, s, h = hidden_states.shape
        device, dtype = hidden_states.device, hidden_states.dtype
        
        if self.short_proj.weight.device != device or self.short_proj.weight.dtype != dtype:
            self.to(device=device, dtype=dtype)
        
        ret_out = self.base_retention(hidden_states)[0]
        short = torch.zeros(b, self.d_state, dtype=dtype, device=device)
        med = torch.zeros(b, self.d_state, dtype=dtype, device=device)
        long = torch.zeros(b, self.d_state*2, dtype=dtype, device=device)
        outs = []
        
        for t in range(s):
            short = self.decays[0]*short + self.short_proj(ret_out[:,t])
            if t % 8 == 0: med = self.decays[1]*med + self.medium_proj(short)
            if t % 64 == 0: long = self.decays[2]*long + self.long_proj(med)
            outs.append(self.fusion(torch.cat([short, med, long], -1)))
        
        return (self.norm(torch.stack(outs, 1)), None)


def replace_attention_with_retention_for_loading(model, use_hierarchical=True):
    layers = getattr(model, 'model', model)
    layers = getattr(layers, 'layers', getattr(layers, 'h', None))
    if layers is None: return model, 0, 0
    
    original_dtype = None
    for param in model.parameters():
        original_dtype = param.dtype
        break
    
    cnt = 0
    for i, layer in enumerate(layers):
        if hasattr(layer, 'self_attn'):
            new_ret = HierarchicalRetention(model.config, i) if use_hierarchical else MultiScaleRetention(model.config, i)
            if original_dtype: new_ret = new_ret.to(dtype=original_dtype)
            layer.self_attn = new_ret
            cnt += 1
    return model, cnt, len(layers)


class PhoenixPreTrainedModel(PreTrainedModel):
    config_class = PhoenixConfig
    base_model_prefix = "phoenix"


class PhoenixModelForCausalLM(PhoenixPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self._model = None
        self._ready = False
    
    @classmethod
    def from_pretrained(cls, path, *args, **kwargs):
        print(f"🔥 PHOENIX v2.0 loading from {path}")
        config = AutoConfig.from_pretrained(path, trust_remote_code=True)
        orig = getattr(config, 'original_model', 'Qwen/Qwen3-0.6B')
        hier = getattr(config, 'use_hierarchical', True)
        
        try:
            base_cfg = AutoConfig.from_pretrained(orig, trust_remote_code=True)
        except:
            base_cfg = config
        
        model = AutoModelForCausalLM.from_config(base_cfg)
        model, conv, tot = replace_attention_with_retention_for_loading(model, hier)
        print(f"   ✅ Converted {conv}/{tot} layers")
        
        sd = None
        if os.path.exists(path):
            for fname in ["model.safetensors", "pytorch_model.bin"]:
                fpath = os.path.join(path, fname)
                if os.path.exists(fpath):
                    if fname.endswith('.safetensors'):
                        from safetensors.torch import load_file
                        sd = load_file(fpath)
                    else:
                        sd = torch.load(fpath, map_location='cpu')
                    break
        else:
            from huggingface_hub import hf_hub_download
            for fname in ["model.safetensors", "pytorch_model.bin"]:
                try:
                    fpath = hf_hub_download(path, fname)
                    if fname.endswith('.safetensors'):
                        from safetensors.torch import load_file
                        sd = load_file(fpath)
                    else:
                        sd = torch.load(fpath, map_location='cpu')
                    break
                except: pass
        
        if sd:
            miss, unex = model.load_state_dict(sd, strict=False)
            print(f"   📦 Weights: {len(miss)} missing, {len(unex)} unexpected")
            
            if 'lm_head.weight' in miss and getattr(config, 'tie_word_embeddings', False):
                if hasattr(model, 'lm_head') and hasattr(model.model, 'embed_tokens'):
                    model.lm_head.weight = model.model.embed_tokens.weight
                    print(f"   🔗 Tied embeddings")
        
        inst = cls(config)
        inst._model = model
        inst._ready = True
        print(f"✅ PHOENIX v2.0 ready!")
        return inst
    
    def forward(self, *a, **k):
        if not self._ready: raise ValueError("Not initialized")
        return self._model(*a, **k)
    
    def generate(self, *a, **k):
        if not self._ready: raise ValueError("Not initialized")
        return self._model.generate(*a, **k)


AutoConfig.register("phoenix", PhoenixConfig)
'''


# =====================================================
# Save / Upload / Evaluate (unchanged)
# =====================================================

def save_phoenix_model_with_code(model, tokenizer, output_path, original_model_url, metadata):
    """Save a converted PHOENIX model together with the files needed to reload it.

    The following are written into ``output_path`` (created if missing):

    * model and tokenizer files via their ``save_pretrained`` methods,
    * ``modeling_phoenix.py`` (text from ``generate_modeling_phoenix_code``),
    * a patched ``config.json`` carrying the PHOENIX flags and ``auto_map``,
    * ``phoenix_metadata.json`` (the ``metadata`` dict as JSON),
    * ``README.md`` with a short usage snippet.

    Args:
        model: model object exposing ``config`` and ``save_pretrained``.
        tokenizer: tokenizer object exposing ``save_pretrained``.
        output_path: destination directory (``str`` or ``Path``).
        original_model_url: identifier of the source model; stored in the
            config as ``original_model`` and shown in the README title.
        metadata: dict of run information; ``num_gpus`` and
            ``use_hierarchical`` are read from it when present.
    """
    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"\n💾 Saving PHOENIX model...")

    # Point lm_head at the embedding matrix again before saving when the
    # config says the two are tied.
    if getattr(model.config, 'tie_word_embeddings', False):
        if hasattr(model, 'lm_head') and hasattr(model, 'model') and hasattr(model.model, 'embed_tokens'):
            model.lm_head.weight = model.model.embed_tokens.weight

    model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    # All text files are written as UTF-8 explicitly: the generated code and
    # the README contain emoji and Korean text, which the platform default
    # encoding cannot always represent.
    modeling_code = generate_modeling_phoenix_code()
    with open(output_path / "modeling_phoenix.py", "w", encoding="utf-8") as f:
        f.write(modeling_code)

    config_path = output_path / "config.json"
    if config_path.exists():
        with open(config_path, "r", encoding="utf-8") as f:
            config_dict = json.load(f)

        config_dict["use_phoenix_retention"] = True
        config_dict["phoenix_version"] = "2.0"
        config_dict["original_model"] = original_model_url
        # The generated loader reads `use_hierarchical` from the config (and
        # falls back to True), so record which retention variant was used;
        # otherwise a non-hierarchical model is rebuilt with the wrong layers.
        config_dict["use_hierarchical"] = bool(
            metadata.get('use_hierarchical', config_dict.get('use_hierarchical', True))
        )
        config_dict["auto_map"] = {
            "AutoModelForCausalLM": "modeling_phoenix.PhoenixModelForCausalLM",
        }

        with open(config_path, "w", encoding="utf-8") as f:
            json.dump(config_dict, f, indent=2)

    with open(output_path / 'phoenix_metadata.json', 'w', encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    readme = f"""# 🔥 PHOENIX v2.0 - {original_model_url}

**Multi-GPU Trained** with {metadata.get('num_gpus', 1)} GPUs

## Features
- ✅ Brumby-style Retraining
- ✅ O(n) Complexity
- ✅ GQA Support

## Usage
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "{output_path.name}",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto"
)
```

**VIDraft AI Research Lab** | PHOENIX v2.0 Multi-GPU
"""

    with open(output_path / "README.md", "w", encoding="utf-8") as f:
        f.write(readme)

    print(f"   ✅ Model saved")

def upload_to_huggingface_hub(
    model_path: str,
    original_model_url: str,
    repo_name: str = None,
    private: bool = True,
    token: str = None,
) -> Tuple[bool, str, str]:
    """Push a saved model folder to the Hugging Face Hub.

    Args:
        model_path: local folder to upload.
        original_model_url: source model id; its last path segment is used to
            build the default repository name ``phoenix-<segment>``.
        repo_name: repository name under the authenticated user; when empty
            the default name is used.
        private: whether the repository is created as private.
        token: access token; ``None`` falls back to the module-level HF_TOKEN.

    Returns:
        ``(success, hub_url, message)``. On failure ``hub_url`` is ``""`` and
        ``message`` describes the problem.
    """
    auth_token = HF_TOKEN if token is None else token
    if not auth_token:
        return False, "", "❌ No HF_TOKEN"

    try:
        hub_api = HfApi(token=auth_token)
        account = hub_api.whoami(token=auth_token)
        owner = account['name']

        # Derive the repository name from the source model when none is given.
        target_name = repo_name or f"phoenix-{original_model_url.split('/')[-1]}"
        repo_id = f"{owner}/{target_name}"

        create_repo(
            repo_id=repo_id,
            token=auth_token,
            private=private,
            repo_type="model",
            exist_ok=True,
        )
        hub_api.upload_folder(
            folder_path=str(model_path),
            repo_id=repo_id,
            repo_type="model",
            token=auth_token,
        )
    except Exception as e:
        return False, "", f"❌ Upload failed: {e}"

    hub_url = f"https://huggingface.co/{repo_id}"
    return True, hub_url, f"✅ Uploaded to {hub_url}"


def evaluate_model_quality(model, tokenizer):
    """Score a model with a small greedy-generation smoke test.

    Three fixed prompts are completed with up to 20 new tokens each. Every
    completion earns up to 1.0 points:

    * 0.3 if the decoded text is longer than the prompt,
    * 0.3 if the text after the prompt contains neither the Unicode
      replacement character nor ``[UNK]``,
    * 0.4 if the decoded text has more than two words beyond the prompt.

    A prompt whose generation raises an ordinary exception scores 0.0 and the
    reason is printed; ``KeyboardInterrupt``/``SystemExit`` are not swallowed.

    Args:
        model: object providing ``eval()``, ``device`` and ``generate(...)``.
        tokenizer: callable tokenizer providing ``decode`` and ``eos_token_id``.

    Returns:
        float: mean score over the prompts, between 0.0 and 1.0.
    """
    test_prompts = [
        "The capital of France is",
        "In machine learning,",
        "2 + 2 =",
    ]
    # Markers of a broken decode: U+FFFD replacement character / unknown token.
    bad_markers = ('\ufffd', '[UNK]')

    model.eval()
    scores = []

    with torch.no_grad():
        for prompt in test_prompts:
            try:
                inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=20,
                    do_sample=False,
                    pad_token_id=tokenizer.eos_token_id,
                )
                generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
                continuation = generated[len(prompt):]

                score = 0.0
                if len(generated) > len(prompt):
                    score += 0.3
                if not any(marker in continuation for marker in bad_markers):
                    score += 0.3
                if len(generated.split()) > len(prompt.split()) + 2:
                    score += 0.4

                scores.append(score)
            except Exception as exc:
                # Best effort: a failing prompt counts as zero, but say why
                # instead of hiding the error.
                print(f"   ⚠️ Evaluation failed for prompt {prompt!r}: {exc}")
                scores.append(0.0)

    return sum(scores) / len(scores) if scores else 0.0


# =====================================================
# 🆕 Multi-GPU Burning function
# =====================================================

def burn_model_with_finetuning(
    model_url: str,
    output_dir: str,
    use_hierarchical: bool = True,
    enable_finetuning: bool = False,
    num_steps: int = 3000,
    batch_size: int = 4,
    learning_rate: float = 1e-5,
    use_gradient_checkpointing: bool = True,
):
    """Run the full burning pipeline for one model.

    Steps: analyse the model structure, load model and tokenizer with
    ``device_map="auto"``, replace attention with retention, optionally
    fine-tune, evaluate, then save everything under ``output_dir``.

    Args:
        model_url: model identifier passed to the ``from_pretrained`` loaders.
        output_dir: directory the converted model is written to.
        use_hierarchical: forwarded to ``replace_attention_with_retention``.
        enable_finetuning: run the fine-tuning step when True.
        num_steps, batch_size, learning_rate, use_gradient_checkpointing:
            forwarded to ``finetune_retention_model``.

    Returns:
        dict: on success ``status == 'success'`` plus path, metrics and timing;
        if any step inside the pipeline raises, ``status == 'failed'`` with
        ``error`` and ``traceback`` strings.
    """
    print("="*80)
    print(f"🔥 PHOENIX Model Burning v2.0 - Multi-GPU ({NUM_GPUS} GPUs)")
    print("="*80)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    try:
        # ---- STEP 1: inspect the source model ----
        print(f"\n🔍 STEP 1: Structure Analysis...")
        layout = analyze_model_structure(model_url)

        # ---- STEP 2: load model + tokenizer ----
        print(f"\n📥 STEP 2: Loading model (Multi-GPU)...")
        pipeline_started = time.time()

        # The config is fetched but not used afterwards; the call is kept so
        # loading behaves (and fails) exactly as before.
        AutoConfig.from_pretrained(model_url, trust_remote_code=True)

        # device_map="auto" lets the loader place the model on the available devices.
        model = AutoModelForCausalLM.from_pretrained(
            model_url,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto",
        )

        tokenizer = AutoTokenizer.from_pretrained(model_url, trust_remote_code=True)
        if tokenizer.pad_token is None:
            # Reuse EOS as padding when the tokenizer defines no pad token.
            tokenizer.pad_token = tokenizer.eos_token

        load_time = time.time() - pipeline_started
        print(f"✅ Loaded across {NUM_GPUS} GPUs in {load_time:.1f}s")

        # ---- STEP 3: attention -> retention ----
        print(f"\n🔄 STEP 3: Converting Attention → Retention...")
        conversion_started = time.time()

        model, converted, total = replace_attention_with_retention(
            model,
            use_hierarchical=use_hierarchical,
            structure_info=layout,
        )

        convert_time = time.time() - conversion_started
        if total > 0:
            rate = converted / total
        else:
            rate = 0

        print(f"✅ Converted {converted}/{total} layers in {convert_time:.1f}s")

        # ---- STEP 4: optional fine-tuning ----
        if not enable_finetuning:
            print(f"\n⏭️ STEP 4: Fine-tuning skipped")
        else:
            print(f"\n🚀 STEP 4: Multi-GPU Fine-tuning...")
            tuning_started = time.time()

            model = finetune_retention_model(
                model=model,
                tokenizer=tokenizer,
                num_steps=num_steps,
                batch_size=batch_size,
                learning_rate=learning_rate,
                use_gradient_checkpointing=use_gradient_checkpointing,
            )

            ft_time = time.time() - tuning_started
            print(f"✅ Fine-tuning completed in {ft_time/60:.1f} minutes")

        # ---- STEP 5: quick quality check ----
        print(f"\n📊 STEP 5: Evaluating...")
        quality_score = evaluate_model_quality(model, tokenizer)
        print(f"✅ Quality: {quality_score:.2f}/1.00")

        # ---- STEP 6: persist model + metadata ----
        print(f"\n💾 STEP 6: Saving...")

        steps_recorded = num_steps if enable_finetuning else 0
        run_metadata = {
            'phoenix_version': '2.0',
            'original_model': model_url,
            'use_hierarchical': use_hierarchical,
            'conversion_rate': rate,
            'quality_score': quality_score,
            'finetuned': enable_finetuning,
            'finetuning_steps': steps_recorded,
            'num_gpus': NUM_GPUS,
            'gradient_checkpointing': use_gradient_checkpointing,
            'timestamp': datetime.now().isoformat(),
        }
        save_phoenix_model_with_code(model, tokenizer, output_path, model_url, run_metadata)

        # Total time is measured from the start of model loading (step 2).
        elapsed = time.time() - pipeline_started

        summary = {
            'status': 'success',
            'model_path': str(output_path),
            'conversion_rate': rate,
            'quality_score': quality_score,
            'total_time': elapsed,
            'finetuned': enable_finetuning,
            'num_gpus': NUM_GPUS,
            'structure_info': layout,
        }

        print(f"\n{'='*80}")
        print(f"✅ Multi-GPU Burning Complete!")
        print(f"   GPUs Used: {NUM_GPUS}")
        print(f"   Model: {output_path}")
        print(f"   Quality: {quality_score:.2f}/1.00")
        print(f"{'='*80}\n")

        return summary

    except Exception as e:
        import traceback
        # Report the failure to the caller instead of raising.
        failure = {
            'status': 'failed',
            'error': str(e),
            'traceback': traceback.format_exc()
        }
        return failure


# =====================================================
# Database (unchanged)
# =====================================================

class ExperimentDatabase:
    """SQLite-backed log of model burning runs (table ``burning_history``).

    Every method opens its own connection and closes it before returning.
    ``with conn:`` alone only commits or rolls back the transaction; it does
    not close the connection, hence the explicit ``finally: conn.close()``.
    """

    def __init__(self, db_path: str):
        """Remember ``db_path`` and make sure the table exists."""
        self.db_path = db_path
        self.init_database()

    def init_database(self):
        """Create the ``burning_history`` table if it does not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:  # commit on success, roll back on error
                conn.execute("""
                    CREATE TABLE IF NOT EXISTS burning_history (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        model_url TEXT,
                        output_path TEXT,
                        hub_url TEXT,
                        conversion_rate REAL,
                        quality_score REAL,
                        finetuned BOOLEAN,
                        num_gpus INTEGER,
                        timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
                    )
                """)
        finally:
            conn.close()

    def save_burning(self, info: Dict) -> int:
        """Insert one run and return the id of the new row.

        Args:
            info: dict with optional keys ``model_url``, ``output_path``,
                ``hub_url``, ``conversion_rate``, ``quality_score``,
                ``finetuned`` and ``num_gpus`` (defaults to 1). Missing keys
                are stored as NULL.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:
                cursor = conn.execute("""
                    INSERT INTO burning_history 
                    (model_url, output_path, hub_url, conversion_rate, quality_score, finetuned, num_gpus)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                """, (
                    info.get('model_url'),
                    info.get('output_path'),
                    info.get('hub_url'),
                    info.get('conversion_rate'),
                    info.get('quality_score'),
                    info.get('finetuned'),
                    info.get('num_gpus', 1),
                ))
            return cursor.lastrowid
        finally:
            conn.close()

    def get_history(self, limit: int = 20) -> List[Dict]:
        """Return up to ``limit`` runs as dicts, newest first.

        ``CURRENT_TIMESTAMP`` has one-second resolution, so ``id`` is used as
        a tie-breaker to keep the order deterministic for runs saved within
        the same second.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute(
                "SELECT * FROM burning_history ORDER BY timestamp DESC, id DESC LIMIT ?",
                (limit,),
            )
            return [dict(row) for row in cursor.fetchall()]
        finally:
            conn.close()


# Module-level database handle used by the UI callbacks in this file;
# constructing it creates the burning_history table if it is missing.
db = ExperimentDatabase(DB_PATH)


# =====================================================
# Gradio UI
# =====================================================

def burn_phoenix_model_ui(
    model_url,
    use_hierarchical,
    output_name,
    enable_finetuning,
    ft_steps,
    ft_batch,
    ft_lr,
    use_grad_ckpt,
    upload_hub,
    hub_repo,
    hub_private,
):
    """Gradio callback: burn a model, optionally upload it, and report the result.

    Args:
        model_url: model identifier to convert (required).
        use_hierarchical: use the hierarchical retention variant.
        output_name: sub-directory name under MODELS_PATH; when blank a name
            is generated from the model id and the current time.
        enable_finetuning: run fine-tuning after conversion.
        ft_steps, ft_batch, ft_lr, use_grad_ckpt: fine-tuning settings.
        upload_hub: try to upload the result to the Hub.
        hub_repo: optional repository name for the upload.
        hub_private: create the Hub repository as private.

    Returns:
        ``(markdown, figure)``; ``figure`` is ``None`` when validation fails
        or an error occurs.
    """
    try:
        # Text inputs may be None or padded with whitespace.
        model_url = (model_url or "").strip()
        if not model_url:
            return "⚠️ Model URL required", None

        output_name = (output_name or "").strip()
        if not output_name:
            output_name = f"phoenix_{model_url.split('/')[-1]}_{int(time.time())}"

        output_dir = f"{MODELS_PATH}/{output_name}"

        # Cost estimate (printed to the log only).
        if enable_finetuning:
            model_size = "0.6B" if "0.6B" in model_url else "1.5B"
            cost = estimate_finetuning_cost(model_size, ft_steps, ft_batch, NUM_GPUS)
            print(f"\n💰 Estimated Cost: ${cost['cost_usd']} ({cost['hours']}h with {NUM_GPUS} GPUs)")

        # Burn
        result = burn_model_with_finetuning(
            model_url=model_url,
            output_dir=output_dir,
            use_hierarchical=use_hierarchical,
            enable_finetuning=enable_finetuning,
            num_steps=ft_steps,
            batch_size=ft_batch,
            learning_rate=ft_lr,
            use_gradient_checkpointing=use_grad_ckpt,
        )

        if result['status'] != 'success':
            return f"❌ Failed\n```\n{result.get('error')}\n```", None

        # Upload: keep the status message so a failed or skipped upload is
        # shown to the user instead of being silently dropped.
        hub_url = None
        upload_msg = None
        if upload_hub:
            if HF_TOKEN:
                success, uploaded_url, upload_msg = upload_to_huggingface_hub(
                    model_path=result['model_path'],
                    original_model_url=model_url,
                    repo_name=(hub_repo or "").strip() or None,
                    private=hub_private,
                )
                # Store a URL only when the upload actually succeeded.
                hub_url = uploaded_url if success else None
            else:
                upload_msg = "❌ No HF_TOKEN"

        # DB
        db.save_burning({
            'model_url': model_url,
            'output_path': result['model_path'],
            'hub_url': hub_url,
            'conversion_rate': result['conversion_rate'],
            'quality_score': result['quality_score'],
            'finetuned': enable_finetuning,
            'num_gpus': NUM_GPUS,
        })

        # Output
        output_md = f"""
# 🔥 PHOENIX v2.0 Multi-GPU Complete!

## Hardware
- **GPUs Used**: {NUM_GPUS} x {torch.cuda.get_device_name(0) if NUM_GPUS > 0 else 'N/A'}

## Model Info
- **Original**: {model_url}
- **Output**: `{result['model_path']}`
- **Conversion**: {result['conversion_rate']*100:.1f}%
- **Quality**: {result['quality_score']:.2f}/1.00
- **Fine-tuned**: {'✅ YES' if enable_finetuning else '❌ NO'}
"""

        if hub_url:
            output_md += f"""

## Hub Status
✅ **Uploaded**: [{hub_url}]({hub_url})

```python
model = AutoModelForCausalLM.from_pretrained(
    "{hub_url.replace('https://huggingface.co/', '')}",
    trust_remote_code=True,
    device_map="auto"  # Multi-GPU
)
```
"""
        elif upload_msg:
            # Upload was requested but did not happen: say why.
            output_md += f"""

## Hub Status
{upload_msg}
"""

        # Plot
        fig = go.Figure()
        fig.add_trace(go.Bar(
            x=['Conversion', 'Quality'],
            y=[result['conversion_rate'], result['quality_score']],
            marker_color=['#3b82f6', '#10b981']
        ))
        fig.update_layout(title=f"Metrics ({NUM_GPUS} GPUs)", yaxis_range=[0, 1])

        return output_md, fig

    except Exception:
        import traceback
        # UI boundary: show the traceback in the output pane rather than raising.
        return f"❌ Error:\n```\n{traceback.format_exc()}\n```", None


def view_history():
    """Gradio callback: show the 20 most recent burning runs.

    Returns:
        ``(markdown, figure)``: a markdown table of the runs and a scatter
        plot of quality score over time (colour = fine-tuned, marker size =
        GPU count). When there is no history or an error occurs, the figure
        is ``None`` and the markdown holds the message.
    """
    try:
        rows = db.get_history(20)
        if rows:
            frame = pd.DataFrame(rows)
            chart = px.scatter(
                frame,
                x='timestamp',
                y='quality_score',
                color='finetuned',
                size='num_gpus',
                title='Burning History (Multi-GPU)',
            )
            return f"## History\n\n{frame.to_markdown(index=False)}", chart
        return "📭 No history", None
    except Exception as e:
        return f"❌ Error: {e}", None


# =====================================================
# Gradio App
# =====================================================

# Build the Gradio application. Components are laid out in the order they are
# created inside the nested context managers, so statement order matters here.
with gr.Blocks(title="🔥 PHOENIX v2.0 Multi-GPU", theme=gr.themes.Soft()) as demo:
    
    # Page header.
    gr.Markdown(f"""
    # 🔥 PHOENIX v2.0 - Multi-GPU Optimized
    
    **H100 x {NUM_GPUS} GPUs Ready**
    
    🆕 **v2.0 Multi-GPU**: Accelerate 통합, DDP 지원  
    🆕 **v2.0**: Fine-tuning 파이프라인 (Brumby-style)  
    ✅ v1.4.3: All fixes included  
    ✅ GQA Support | O(n) Complexity
    
    ---
    """)
    
    with gr.Tabs():
        # Tab 1: convert ("burn") a model.
        with gr.Tab("🔥 Model Burning"):
            with gr.Row():
                # Left column: inputs.
                with gr.Column(scale=1):
                    burn_url = gr.Textbox(
                        label="🔗 Model URL",
                        value=DEFAULT_MODEL,
                        placeholder="Qwen/Qwen3-0.6B"
                    )
                    burn_hier = gr.Checkbox(value=True, label="Hierarchical Retention")
                    burn_name = gr.Textbox(label="💾 Output Name", placeholder="my_model")
                    
                    gr.Markdown("---")
                    gr.Markdown(f"### 🆕 Fine-tuning ({NUM_GPUS} GPUs)")
                    
                    burn_ft_enable = gr.Checkbox(
                        value=False, 
                        label="🚀 Enable Fine-tuning (Brumby-style)",
                        info=f"Multi-GPU acceleration with {NUM_GPUS} GPUs!"
                    )
                    
                    # The four fine-tuning controls below start hidden and are
                    # revealed by toggle_ft when fine-tuning is enabled.
                    burn_ft_steps = gr.Slider(
                        1000, 10000, 3000,
                        step=100,
                        label="Steps",
                        visible=False
                    )
                    
                    burn_ft_batch = gr.Slider(
                        1, 16, 4, 
                        step=1, 
                        label=f"Batch Size per GPU ({NUM_GPUS} GPUs)", 
                        visible=False
                    )
                    burn_ft_lr = gr.Number(value=1e-5, label="Learning Rate", visible=False)
                    
                    burn_grad_ckpt = gr.Checkbox(
                        value=True, 
                        label="✅ Gradient Checkpointing (saves memory)",
                        visible=False
                    )
                    
                    # Returns one visibility update per fine-tuning control,
                    # in the same order as the outputs list wired up below.
                    def toggle_ft(enabled):
                        return [
                            gr.update(visible=enabled),
                            gr.update(visible=enabled),
                            gr.update(visible=enabled),
                            gr.update(visible=enabled),
                        ]
                    
                    burn_ft_enable.change(
                        toggle_ft,
                        [burn_ft_enable],
                        [burn_ft_steps, burn_ft_batch, burn_ft_lr, burn_grad_ckpt]
                    )
                    
                    gr.Markdown("---")
                    gr.Markdown("### 🌐 Hub Upload")
                    
                    burn_upload = gr.Checkbox(value=True, label="📤 Upload to Hub")
                    burn_repo = gr.Textbox(label="📦 Repo Name (optional)")
                    burn_private = gr.Checkbox(value=True, label="🔒 Private")
                    
                    burn_btn = gr.Button("🔥 Burn Model", variant="primary", size="lg")
                
                # Right column: result markdown and metrics plot.
                with gr.Column(scale=2):
                    burn_output = gr.Markdown()
                    burn_plot = gr.Plot()
            
            # The inputs list must match the positional parameters of
            # burn_phoenix_model_ui, in order.
            burn_btn.click(
                burn_phoenix_model_ui,
                [
                    burn_url, burn_hier, burn_name,
                    burn_ft_enable, burn_ft_steps, burn_ft_batch, burn_ft_lr, burn_grad_ckpt,
                    burn_upload, burn_repo, burn_private
                ],
                [burn_output, burn_plot]
            )
        
        # Tab 2: past runs read from the experiment database.
        with gr.Tab("📊 History"):
            with gr.Row():
                with gr.Column(scale=1):
                    hist_btn = gr.Button("📊 Load", variant="primary")
                with gr.Column(scale=2):
                    hist_out = gr.Markdown()
                    hist_plot = gr.Plot()
            
            hist_btn.click(view_history, outputs=[hist_out, hist_plot])
    
    # Page footer: hardware summary and whether an HF token was found.
    gr.Markdown(f"""
    ---
    
    ## 🔥 PHOENIX v2.0 Multi-GPU
    
    **Hardware**: {NUM_GPUS} x {torch.cuda.get_device_name(0) if NUM_GPUS > 0 else 'N/A'}
    
    **Features**:
    - 🆕 Multi-GPU Training (DDP)
    - 🆕 Gradient Checkpointing
    - 🆕 H100 Optimized (fused optimizer)
    - 🆕 Brumby-style Fine-tuning
    - ✅ All v1.4.3 Fixes
    
    **Token**: {'✅' if HF_TOKEN else '❌ Not Found'}  
    **VIDraft AI Research Lab** | PHOENIX v2.0 Multi-GPU
    """)


if __name__ == "__main__":
    # Script entry point: parse the server options and launch the Gradio app.
    import argparse

    parser = argparse.ArgumentParser(description='PHOENIX v2.0 Multi-GPU')
    parser.add_argument('--port', type=int, default=None, help='Server port (default: auto find 7860-7960)')
    parser.add_argument('--share', action='store_true', help='Create public Gradio link')
    parser.add_argument('--host', type=str, default="0.0.0.0", help='Server host')
    args = parser.parse_args()

    demo.queue(max_size=20)

    # Options shared by every launch attempt.
    launch_options = dict(server_name=args.host, share=args.share, show_error=True)

    if args.port is not None:
        demo.launch(server_port=args.port, **launch_options)
    else:
        # Automatic port selection: try 7860 through 7960 inclusive, matching
        # the range promised by the --port help text.
        for port in range(7860, 7961):
            try:
                demo.launch(server_port=port, **launch_options)
                break
            except OSError:
                # Port unavailable; try the next one.
                continue
        else:
            # Every port failed: exit with an error instead of finishing silently.
            raise SystemExit("❌ No free port found in range 7860-7960")