Spaces:

kouki321
/

Third_Try_Cag

Sleeping

App Files Community

kouki321 committed on Apr 7

Commit

7858597

verified ·

1 Parent(s): d73fe5d

Create app.py

Browse files

Files changed (1) hide show

app.py +87 -0

app.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers.cache_utils import DynamicCache
+import os
+# Minimal generate function for token-by-token generation
def generate(model, input_ids: torch.Tensor, past_key_values, max_new_tokens: int = 50) -> torch.Tensor:
    """Greedily decode up to ``max_new_tokens`` tokens, one forward pass per token.

    The first forward pass receives all of ``input_ids`` together with
    ``past_key_values``; every later pass receives only the token picked in the
    previous step plus the cache returned by that step.

    Args:
        model: Callable causal LM. It must expose ``model.embed_tokens.weight``
            (used to find the target device) and ``config.eos_token_id`` (an
            int, a list/tuple of ints, or ``None``), and its output must carry
            ``logits`` and ``past_key_values``.
        input_ids: Token ids of shape ``(batch, seq_len)``.
        past_key_values: Cache handed to the first forward pass.
            NOTE(review): a cache object that the model extends in place will
            have grown when this function returns -- confirm with the caller.
        max_new_tokens: Upper bound on the number of decoding steps.

    Returns:
        Only the newly generated ids, shape ``(batch, n_generated)``. The EOS
        token that ends generation is included. With ``batch > 1`` decoding
        continues until every row has produced an EOS token, so rows that
        finish early may contain tokens after their EOS.
    """
    device = model.model.embed_tokens.weight.device
    input_ids = input_ids.to(device)

    # Normalise the configured EOS id(s): the config may hold a single int or
    # a list of ints, and a plain ``==`` against a list would never match.
    eos_config = model.config.eos_token_id
    if eos_config is None:
        eos_values = []
    elif isinstance(eos_config, (list, tuple)):
        eos_values = list(eos_config)
    else:
        eos_values = [eos_config]
    eos_ids = torch.tensor(eos_values, device=device) if eos_values else None

    # One flag per batch row, so that batch sizes above 1 work (``.item()`` on
    # a multi-element tensor raises).
    finished = torch.zeros(input_ids.shape[0], dtype=torch.bool, device=device)
    new_tokens = []
    next_token = input_ids
    with torch.no_grad():
        for _ in range(max_new_tokens):
            out = model(
                input_ids=next_token,
                past_key_values=past_key_values,
                use_cache=True,
            )
            # Greedy choice from the distribution at the last position. The
            # logits may live on another device than the embeddings, so the
            # chosen token is moved before it is stored or fed back.
            token = torch.argmax(out.logits[:, -1, :], dim=-1, keepdim=True).to(device)
            new_tokens.append(token)
            past_key_values = out.past_key_values
            next_token = token
            if eos_ids is not None:
                finished |= torch.isin(token, eos_ids).squeeze(-1)
                if bool(finished.all()):
                    break

    if not new_tokens:
        # No decoding step ran: return an empty (batch, 0) tensor.
        return input_ids.new_empty((input_ids.shape[0], 0))
    # Return just the newly generated part.
    return torch.cat(new_tokens, dim=-1)
# Mark these types as safe for torch's weights_only unpickler -- presumably so
# that a cache written with torch.save can be read back with torch.load.
torch.serialization.add_safe_globals([DynamicCache, set])
def get_kv_cache(model, tokenizer, prompt: str) -> DynamicCache:
    """Run ``prompt`` through ``model`` once and return the filled KV cache.

    Args:
        model: Causal LM exposing ``model.embed_tokens.weight``; the prompt ids
            are moved to that weight's device before the forward pass.
        tokenizer: Callable that returns an object with ``input_ids`` when
            invoked with ``return_tensors="pt"``.
        prompt: Text to encode and feed to the model.

    Returns:
        The ``DynamicCache`` that was passed to the forward pass as
        ``past_key_values``.
    """
    target_device = model.model.embed_tokens.weight.device
    encoded = tokenizer(prompt, return_tensors="pt")
    prompt_ids = encoded.input_ids.to(target_device)

    # Start from an empty cache; the forward pass below is what fills it.
    kv_cache = DynamicCache()
    with torch.no_grad():
        # Only the side effect on the cache matters; the output is discarded.
        model(input_ids=prompt_ids, past_key_values=kv_cache, use_cache=True)
    return kv_cache
def clean_up(cache: DynamicCache, origin_len: int):
    """Truncate every layer of ``cache`` in place to its first ``origin_len`` positions.

    Both ``cache.key_cache`` and ``cache.value_cache`` are sliced along their
    third axis, which drops whatever was appended after the original prompt.

    Args:
        cache: Object holding per-layer ``key_cache`` and ``value_cache``
            sequences of 4-D tensors.
        origin_len: Number of leading positions to keep on the third axis.
    """
    # Same index for keys and values: keep everything on axes 0, 1 and 3.
    keep = (slice(None), slice(None), slice(None, origin_len), slice(None))
    for layer_idx, layer_keys in enumerate(cache.key_cache):
        cache.key_cache[layer_idx] = layer_keys[keep]
        cache.value_cache[layer_idx] = cache.value_cache[layer_idx][keep]
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# NOTE: trust_remote_code=True allows code shipped in the hub repository to run
# while loading. If the repository requires authentication, pass token=... to
# both from_pretrained calls.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Half precision when a GPU is available, full precision otherwise.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,
)

# device_map="auto" has already placed the weights, so the model is deliberately
# not moved again with model.to(...): moving a dispatched model is redundant
# and can fail when some modules are offloaded. The `device` name is kept for
# any code that refers to it.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loaded {model_name}.")
# Input document and output location for the serialized cache.
DOC_PATH = "/kaggle/input/delice/delice.txt"
CACHE_PATH = "/kaggle/working/ronan_caches.pth"

# Fail early with the path that was actually checked.
if not os.path.exists(DOC_PATH):
    raise FileNotFoundError(f"Knowledge document not found: {DOC_PATH}")
with open(DOC_PATH, "r", encoding="utf-8") as f:
    doc_text = f.read()

# The prompt ends right after "Question:" so that a question can be appended
# to the cached prefix.
# NOTE(review): the <|system|>/<|user|> tags are written by hand -- confirm
# they match the chat format this model expects.
system_prompt = f"""
<|system|>
Answer concisely and precisely, You are an assistant who provides concise factual answers.
<|user|>
Context:
{doc_text}
Question:
""".strip()

# Build the cache for the fixed prompt prefix and persist it.
ronan_cache = get_kv_cache(model, tokenizer, system_prompt)
torch.save(ronan_cache, CACHE_PATH)

# Cached length of the prefix (second-to-last axis of the first layer's keys);
# clean_up() uses this value to trim the cache back to the prefix.
origin_len = ronan_cache.key_cache[0].shape[-2]
print("KV cache built.")