rajeshlion committed
Commit 4ab8ff5 · verified · 1 Parent(s): b91179c

Update app.py

Files changed (1)
  1. app.py +186 -186
app.py CHANGED
@@ -403,86 +403,205 @@
403
 
404
 
405
 
406
  # import os
407
  # import gradio as gr
408
- # import torch
409
- # from transformers import AutoTokenizer, AutoModelForCausalLM
410
-
411
- # # You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
412
- # MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
413
-
414
- # # Load once at startup
415
- # print(f"🔧 Loading local model: {MODEL_ID}")
416
- # tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
417
- # model = AutoModelForCausalLM.from_pretrained(
418
- # MODEL_ID,
419
- # torch_dtype=torch.float32, # CPU-friendly
420
  # )
421
- # model.eval()
422
 
423
- # def build_prompt(system_message: str, history, user_msg: str) -> str:
424
- # """Try to use the model's chat template if present; otherwise use a generic prompt."""
425
- # messages = []
426
- # if system_message:
427
- # messages.append({"role": "system", "content": system_message})
428
- # for u, a in (history or []):
429
- # if u:
430
- # messages.append({"role": "user", "content": u})
431
- # if a:
432
- # messages.append({"role": "assistant", "content": a})
433
- # messages.append({"role": "user", "content": user_msg})
434
 
435
- # # Use chat template when available
436
- # try:
437
- # if getattr(tokenizer, "chat_template", None):
438
- # return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
439
- # except Exception:
440
- # pass
441
 
442
- # # Fallback generic formatting
443
- # parts = []
444
- # if system_message:
445
- # parts.append(f"System: {system_message}")
446
  # for u, a in (history or []):
447
  # if u:
448
- # parts.append(f"User: {u}")
449
  # if a:
450
- # parts.append(f"Assistant: {a}")
451
- # parts.append(f"User: {user_msg}")
452
- # parts.append("Assistant:")
453
- # return "\n".join(parts)
454
-
455
- # def respond(message, history, system_message, max_tokens, temperature, top_p):
456
- # prompt = build_prompt(system_message, history, message)
457
- # inputs = tokenizer(prompt, return_tensors="pt")
458
- # with torch.no_grad():
459
- # outputs = model.generate(
460
- # **inputs,
461
- # max_new_tokens=int(max_tokens),
462
- # do_sample=True,
463
- # temperature=float(temperature),
464
- # top_p=float(top_p),
465
- # pad_token_id=tokenizer.eos_token_id,
466
- # eos_token_id=tokenizer.eos_token_id,
467
- # )
468
- # # Decode only the newly generated portion
469
- # gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
470
- # text = tokenizer.decode(gen_ids, skip_special_tokens=True)
471
-
472
- # # Stream the text in chunks so the UI feels live
473
  # acc = ""
474
- # for i in range(0, len(text), 40):
475
- # acc += text[i:i+40]
476
- # yield acc
477
 
478
  # demo = gr.ChatInterface(
479
  # respond,
480
  # additional_inputs=[
481
- # gr.Textbox(
482
- # value=("You are a Chatbot who only answers spiritual questions based on three religious scriptures: (a) Hindu, e.g., the Bhagavad Gita, (b) Jewish, e.g., the Torah, (c) Christian, e.g., the Bible"
483
- # ". You will offer all three perspectives. You decline to answer questions that do not relate to spirituality."),
484
- # label="System message",
485
- # ),
486
  # gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
487
  # gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
488
  # gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
@@ -490,126 +609,7 @@
490
  # )
491
 
492
  # if __name__ == "__main__":
493
- # # share=True gives you a public link automatically
494
  # demo.launch(share=True)
495
 
496
- import os
497
- import gradio as gr
498
-
499
- # ---- llama.cpp backend (fast CPU) ----
500
- from llama_cpp import Llama
501
-
502
- # ---- to list files in a repo and pick a GGUF automatically ----
503
- from huggingface_hub import list_repo_files
504
-
505
- # ----------------- Config -----------------
506
- # You can override these via Space "Settings → Variables"
507
- # If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
508
- MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None
509
-
510
- # Known small GGUF chat repos (fast & lightweight). We'll try them in order.
511
- CANDIDATE_REPOS = [
512
- MODEL_REPO, # user-preferred first (may be None)
513
- "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
514
- "Qwen/Qwen2-0.5B-Instruct-GGUF",
515
- "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
516
- "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
517
- ]
518
-
519
- # Best-to-worst file name patterns to prefer when multiple GGUFs are present.
520
- PREFERRED_PATTERNS = [
521
- "q4_k_m.gguf", "Q4_K_M.gguf",
522
- "q4_0.gguf", "Q4_0.gguf",
523
- "q5_k_m.gguf", "Q5_K_M.gguf",
524
- ".gguf", # catch-all
525
- ]
526
-
527
- # Runtime knobs
528
- N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
529
- CTX = int(os.getenv("CTX", "2048"))
530
-
531
- SYSTEM_DEFAULT = (
532
- "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
533
- "and politely decline other questions."
534
- "and politely declines other questions."
535
-
536
- # --------------- GGUF Picker ---------------
537
- def pick_repo_and_file():
538
- """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
539
- tried = []
540
- for repo in [r for r in CANDIDATE_REPOS if r]: # drop None
541
- try:
542
- files = list_repo_files(repo)
543
- except Exception:
544
- tried.append(f"{repo} (list failed)")
545
- continue
546
- ggufs = [f for f in files if f.lower().endswith(".gguf")]
547
- if not ggufs:
548
- tried.append(f"{repo} (no .gguf)")
549
- continue
550
- # pick by pattern preference
551
- for pat in PREFERRED_PATTERNS:
552
- for f in ggufs:
553
- if pat in f:
554
- return repo, f
555
- tried_str = " | ".join(tried) if tried else "(none)"
556
- raise RuntimeError(
557
- "No GGUF file found in any candidate repo.\n"
558
- f"Tried: {tried_str}\n"
559
- "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
560
- "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
561
- )
562
-
563
- REPO_ID, FILENAME = pick_repo_and_file()
564
- print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")
565
-
566
- llm = Llama.from_pretrained(
567
- repo_id=REPO_ID,
568
- filename=FILENAME,
569
- n_ctx=CTX,
570
- n_threads=N_THREADS,
571
- n_gpu_layers=0, # CPU only
572
- logits_all=False,
573
- verbose=False,
574
- )
575
-
576
- def respond(message, history, system_message, max_tokens, temperature, top_p):
577
- sysmsg = system_message or SYSTEM_DEFAULT
578
- msgs = [{"role": "system", "content": sysmsg}]
579
- for u, a in (history or []):
580
- if u:
581
- msgs.append({"role": "user", "content": u})
582
- if a:
583
- msgs.append({"role": "assistant", "content": a})
584
- msgs.append({"role": "user", "content": message})
585
-
586
- stream = llm.create_chat_completion(
587
- messages=msgs,
588
- temperature=float(temperature),
589
- top_p=float(top_p),
590
- max_tokens=int(max_tokens),
591
- stream=True,
592
- )
593
- acc = ""
594
- for chunk in stream:
595
- delta = chunk["choices"][0]["delta"]
596
- tok = delta.get("content", "")
597
- if tok:
598
- acc += tok
599
- yield acc
600
-
601
- demo = gr.ChatInterface(
602
- respond,
603
- additional_inputs=[
604
- gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
605
- gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
606
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
607
- gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
608
- ],
609
- )
610
-
611
- if __name__ == "__main__":
612
- print(f"🧵 Threads: {N_THREADS}")
613
- demo.launch(share=True)
614
-
615
 
 
403
 
404
 
405
 
406
+ import os
407
+ import gradio as gr
408
+ import torch
409
+ from transformers import AutoTokenizer, AutoModelForCausalLM
410
+
411
+ # You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
412
+ MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
413
+
414
+ # Load once at startup
415
+ print(f"🔧 Loading local model: {MODEL_ID}")
416
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
417
+ model = AutoModelForCausalLM.from_pretrained(
418
+ MODEL_ID,
419
+ torch_dtype=torch.float32, # CPU-friendly
420
+ )
421
+ model.eval()
422
+
423
+ def build_prompt(system_message: str, history, user_msg: str) -> str:
424
+ """Try to use the model's chat template if present; otherwise use a generic prompt."""
425
+ messages = []
426
+ if system_message:
427
+ messages.append({"role": "system", "content": system_message})
428
+ for u, a in (history or []):
429
+ if u:
430
+ messages.append({"role": "user", "content": u})
431
+ if a:
432
+ messages.append({"role": "assistant", "content": a})
433
+ messages.append({"role": "user", "content": user_msg})
434
+
435
+ # Use chat template when available
436
+ try:
437
+ if getattr(tokenizer, "chat_template", None):
438
+ return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
439
+ except Exception:
440
+ pass
441
+
442
+ # Fallback generic formatting
443
+ parts = []
444
+ if system_message:
445
+ parts.append(f"System: {system_message}")
446
+ for u, a in (history or []):
447
+ if u:
448
+ parts.append(f"User: {u}")
449
+ if a:
450
+ parts.append(f"Assistant: {a}")
451
+ parts.append(f"User: {user_msg}")
452
+ parts.append("Assistant:")
453
+ return "\n".join(parts)
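To see exactly what the chat-template branch of build_prompt() above returns, the rendered prompt can be printed directly. A minimal sketch, assuming the same default TinyLlama checkpoint as MODEL_ID above and a toy message list (the names tok and msgs are illustrative, not part of the app):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", use_fast=True)
msgs = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
# TinyLlama ships a chat template, so this prints the fully formatted prompt,
# including the trailing assistant tag added by add_generation_prompt=True.
print(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))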
454
+
455
+ def respond(message, history, system_message, max_tokens, temperature, top_p):
456
+ prompt = build_prompt(system_message, history, message)
457
+ inputs = tokenizer(prompt, return_tensors="pt")
458
+ with torch.no_grad():
459
+ outputs = model.generate(
460
+ **inputs,
461
+ max_new_tokens=int(max_tokens),
462
+ do_sample=True,
463
+ temperature=float(temperature),
464
+ top_p=float(top_p),
465
+ pad_token_id=tokenizer.eos_token_id,
466
+ eos_token_id=tokenizer.eos_token_id,
467
+ )
468
+ # Decode only the newly generated portion
469
+ gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
470
+ text = tokenizer.decode(gen_ids, skip_special_tokens=True)
471
+
472
+ # Stream the text in chunks so the UI feels live
473
+ acc = ""
474
+ for i in range(0, len(text), 40):
475
+ acc += text[i:i+40]
476
+ yield acc
477
+
478
+ demo = gr.ChatInterface(
479
+ respond,
480
+ additional_inputs=[
481
+ gr.Textbox(
482
+ value=("You are a Chatbot who only answers spiritual questions based on three religious scriptures: (a) Hindu, e.g., the Bhagavad Gita, (b) Jewish, e.g., the Torah, (c) Christian, e.g., the Bible"
483
+ ". You will offer all three perspectives. You decline to answer questions that do not relate to spirituality."),
484
+ label="System message",
485
+ ),
486
+ gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
487
+ gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
488
+ gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
489
+ ],
490
+ )
491
+
492
+ if __name__ == "__main__":
493
+ # share=True gives you a public link automatically
494
+ demo.launch(share=True)
495
+
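Note: respond() above generates the complete reply first and then yields it in 40-character slices, so the streaming is simulated. If genuine token-by-token streaming is wanted, a minimal sketch using transformers' TextIteratorStreamer could look like the following; it reuses the tokenizer, model, and build_prompt() defined above, and the helper name respond_streaming is hypothetical, not part of this commit.

from threading import Thread
from transformers import TextIteratorStreamer

def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    # Build the prompt exactly as respond() does above.
    prompt = build_prompt(system_message, history, message)
    inputs = tokenizer(prompt, return_tensors="pt")
    # The streamer yields decoded text pieces as generate() emits tokens.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=int(max_tokens),
        do_sample=True,
        temperature=float(temperature),
        top_p=float(top_p),
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )
    # generate() blocks until it finishes, so run it in a background thread
    # and consume the streamer incrementally here.
    Thread(target=model.generate, kwargs=gen_kwargs).start()
    acc = ""
    for piece in streamer:
        acc += piece
        yield acc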
496
  # import os
497
  # import gradio as gr
498
+
499
+ # # ---- llama.cpp backend (fast CPU) ----
500
+ # from llama_cpp import Llama
501
+
502
+ # # ---- to list files in a repo and pick a GGUF automatically ----
503
+ # from huggingface_hub import list_repo_files
504
+
505
+ # # ----------------- Config -----------------
506
+ # # You can override these via Space "Settings → Variables"
507
+ # # If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
508
+ # MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None
509
+
510
+ # # Known small GGUF chat repos (fast & lightweight). We'll try them in order.
511
+ # CANDIDATE_REPOS = [
512
+ # MODEL_REPO, # user-preferred first (may be None)
513
+ # "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
514
+ # "Qwen/Qwen2-0.5B-Instruct-GGUF",
515
+ # "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
516
+ # "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
517
+ # ]
518
+
519
+ # # Best-to-worst file name patterns to prefer when multiple GGUFs are present.
520
+ # PREFERRED_PATTERNS = [
521
+ # "q4_k_m.gguf", "Q4_K_M.gguf",
522
+ # "q4_0.gguf", "Q4_0.gguf",
523
+ # "q5_k_m.gguf", "Q5_K_M.gguf",
524
+ # ".gguf", # catch-all
525
+ # ]
526
+
527
+ # # Runtime knobs
528
+ # N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
529
+ # CTX = int(os.getenv("CTX", "2048"))
530
+
531
+ # SYSTEM_DEFAULT = (
532
+ # "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
533
+ # "and politely declines other questions."
534
  # )
 
535
 
536
+ # # --------------- GGUF Picker ---------------
537
+ # def pick_repo_and_file():
538
+ # """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
539
+ # tried = []
540
+ # for repo in [r for r in CANDIDATE_REPOS if r]: # drop None
541
+ # try:
542
+ # files = list_repo_files(repo)
543
+ # except Exception:
544
+ # tried.append(f"{repo} (list failed)")
545
+ # continue
546
+ # ggufs = [f for f in files if f.lower().endswith(".gguf")]
547
+ # if not ggufs:
548
+ # tried.append(f"{repo} (no .gguf)")
549
+ # continue
550
+ # # pick by pattern preference
551
+ # for pat in PREFERRED_PATTERNS:
552
+ # for f in ggufs:
553
+ # if pat in f:
554
+ # return repo, f
555
+ # tried_str = " | ".join(tried) if tried else "(none)"
556
+ # raise RuntimeError(
557
+ # "No GGUF file found in any candidate repo.\n"
558
+ # f"Tried: {tried_str}\n"
559
+ # "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
560
+ # "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
561
+ # )
562
 
563
+ # REPO_ID, FILENAME = pick_repo_and_file()
564
+ # print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")
565
+
566
+ # llm = Llama.from_pretrained(
567
+ # repo_id=REPO_ID,
568
+ # filename=FILENAME,
569
+ # n_ctx=CTX,
570
+ # n_threads=N_THREADS,
571
+ # n_gpu_layers=0, # CPU only
572
+ # logits_all=False,
573
+ # verbose=False,
574
+ # )
575
 
576
+ # def respond(message, history, system_message, max_tokens, temperature, top_p):
577
+ # sysmsg = system_message or SYSTEM_DEFAULT
578
+ # msgs = [{"role": "system", "content": sysmsg}]
 
579
  # for u, a in (history or []):
580
  # if u:
581
+ # msgs.append({"role": "user", "content": u})
582
  # if a:
583
+ # msgs.append({"role": "assistant", "content": a})
584
+ # msgs.append({"role": "user", "content": message})
585
+
586
+ # stream = llm.create_chat_completion(
587
+ # messages=msgs,
588
+ # temperature=float(temperature),
589
+ # top_p=float(top_p),
590
+ # max_tokens=int(max_tokens),
591
+ # stream=True,
592
+ # )
593
  # acc = ""
594
+ # for chunk in stream:
595
+ # delta = chunk["choices"][0]["delta"]
596
+ # tok = delta.get("content", "")
597
+ # if tok:
598
+ # acc += tok
599
+ # yield acc
600
 
601
  # demo = gr.ChatInterface(
602
  # respond,
603
  # additional_inputs=[
604
+ # gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
605
  # gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
606
  # gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
607
  # gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
 
609
  # )
610
 
611
  # if __name__ == "__main__":
612
+ # print(f"🧵 Threads: {N_THREADS}")
613
  # demo.launch(share=True)
614
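For reference, the filename preference used by the (now commented-out) pick_repo_and_file() above can be exercised on its own with a small self-contained sketch; pick_gguf and the sample file list below are hypothetical, while PREFERRED_PATTERNS is copied from the block above. Earlier patterns win and ".gguf" is the catch-all.

# Order matters: the first pattern that matches any file decides the pick.
PREFERRED_PATTERNS = [
    "q4_k_m.gguf", "Q4_K_M.gguf",
    "q4_0.gguf", "Q4_0.gguf",
    "q5_k_m.gguf", "Q5_K_M.gguf",
    ".gguf",  # catch-all
]

def pick_gguf(files):
    # Keep only GGUF files, then return the first one matching the best pattern.
    ggufs = [f for f in files if f.lower().endswith(".gguf")]
    for pat in PREFERRED_PATTERNS:
        for f in ggufs:
            if pat in f:
                return f
    return None

files = ["README.md", "model-q8_0.gguf", "model-q5_k_m.gguf", "model-q4_k_m.gguf"]
print(pick_gguf(files))  # -> model-q4_k_m.gguf (q4_k_m outranks q5_k_m; q8_0 only matches the catch-all)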