Update app.py
app.py (CHANGED)
@@ -25,7 +25,7 @@ import pandas as pd
 from typing import Dict, List, Any, Tuple, Optional
 import chromadb
 from chromadb.config import Settings
-from transformers import AutoModel, AutoTokenizer, AutoConfig
+from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForCausalLM
 import copy
 
 # =====================================================
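The import change is the heart of this commit: AutoModel loads only the base transformer, whose forward pass returns hidden states, while AutoModelForCausalLM adds the lm_head that projects those states to vocabulary logits, which the generation loop below needs. A minimal sketch of the difference, using "gpt2" purely as a stand-in checkpoint:

import torch
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")              # stand-in checkpoint
ids = tok("hello", return_tensors="pt").input_ids

base = AutoModel.from_pretrained("gpt2")                 # old import path
with torch.no_grad():
    print(base(input_ids=ids).last_hidden_state.shape)   # [1, seq, hidden], no logits

lm = AutoModelForCausalLM.from_pretrained("gpt2")        # new import path
with torch.no_grad():
    print(lm(input_ids=ids).logits.shape)                # [1, seq, vocab], ready to sample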
@@ -693,21 +693,35 @@ def generate_text_phoenix(
     if not convert_attention or not model_url.strip():
         return "⚠️ Enable 'Attention Replace' and provide model URL", ""
 
-    # 1. Model
+    # 1. ✅ Load CausalLM model (includes lm_head)
+    print(f"📥 Loading CausalLM model: {model_url}")
+    config = AutoConfig.from_pretrained(model_url, trust_remote_code=True)
 
+    # Load full causal LM model
+    model = AutoModelForCausalLM.from_pretrained(
+        model_url,
+        trust_remote_code=True,
+        torch_dtype=torch.float16
+    ).to(DEVICE)
 
+    # 2. Convert Attention → Retention
+    print(f"🔄 Converting attention to retention...")
+    model.model, converted, total = replace_attention_with_retention(
+        model.model,  # Convert the base model, keep lm_head
+        use_hierarchical=use_hierarchical
+    )
+
+    print(f"✅ Converted {converted}/{total} layers")
 
-    #
+    # 3. Load tokenizer
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_url, trust_remote_code=True)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
     except Exception as e:
         return f"❌ Tokenizer load failed: {e}", ""
 
-    #
+    # 4. Tokenize input
     inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
     input_ids = inputs["input_ids"]
 
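replace_attention_with_retention is defined elsewhere in app.py; only its call site appears in this diff. A minimal sketch of the module swap the call implies, assuming a Llama-style decoder layout (base_model.layers[i].self_attn); the make_retention factory is a hypothetical stand-in for the app's actual Retention classes:

import torch.nn as nn

def replace_attention_with_retention(base_model: nn.Module,
                                     use_hierarchical: bool = False,
                                     make_retention=None):
    # Walk the decoder stack and swap each attention block for a
    # retention block built by the caller-supplied factory.
    converted = total = 0
    for layer in getattr(base_model, "layers", []):   # assumed Llama-style layout
        total += 1
        if make_retention is not None and hasattr(layer, "self_attn"):
            layer.self_attn = make_retention(layer.self_attn, use_hierarchical)
            converted += 1
    return base_model, converted, total

Passing model.model rather than model keeps lm_head out of the walk, so the output projection survives the conversion performed in step 2 above.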
@@ -716,15 +730,17 @@ def generate_text_phoenix(
     print(f" Input tokens: {input_ids.shape[1]}")
     print(f" Max new tokens: {max_new_tokens}")
 
-    #
+    # 5. Generate
     start_time = time.time()
     generated_ids = []
 
     with torch.no_grad():
-        for
-            # Forward pass
+        for step in range(max_new_tokens):
+            # Forward pass (now with lm_head)
             outputs = model(input_ids=input_ids)
+
+            # Get logits from lm_head
+            logits = outputs.logits[:, -1, :]  # [B, vocab_size]
 
             # Temperature sampling
             if temperature > 0:
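The sampling body between this hunk and the next is unchanged context, so the diff omits it. A conventional version of what sits there, for orientation only (the actual app.py lines may differ):

# Slots into the loop after logits = outputs.logits[:, -1, :]
if temperature > 0:
    probs = torch.softmax(logits / temperature, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)     # [B, 1], sampled
else:
    next_token = torch.argmax(logits, dim=-1, keepdim=True)  # [B, 1], greedy

generated_ids.append(next_token.item())
# No KV/retention cache is used here, so the full sequence is re-run
# each step, matching model(input_ids=input_ids) above.
input_ids = torch.cat([input_ids, next_token], dim=-1)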
@@ -739,7 +755,12 @@ def generate_text_phoenix(
 
             # Stop at EOS
             if next_token.item() == tokenizer.eos_token_id:
+                print(f" Stopped at EOS token")
                 break
+
+            # Progress
+            if (step + 1) % 10 == 0:
+                print(f" Generated {step + 1}/{max_new_tokens} tokens...")
 
     elapsed = time.time() - start_time
 
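The diff stops at the timing line. A typical epilogue, sketched under the assumption that the function returns generated text plus a stats string, mirroring the two-string error returns above (the actual tail of generate_text_phoenix is not shown):

generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
tokens_per_sec = len(generated_ids) / max(elapsed, 1e-6)   # guard zero elapsed
stats = f"{len(generated_ids)} tokens in {elapsed:.2f}s ({tokens_per_sec:.1f} tok/s)"
return generated_text, stats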