# import gradio as gr
# from huggingface_hub import InferenceClient
# import spaces

# """
# For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
# """
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

# @spaces.GPU
# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     messages = [{"role": "system", "content": system_message}]
#     for val in history:
#         if val[0]:
#             messages.append({"role": "user", "content": val[0]})
#         if val[1]:
#             messages.append({"role": "assistant", "content": val[1]})
#     messages.append({"role": "user", "content": message})
#     response = ""
#     for message in client.chat_completion(
#         messages,
#         max_tokens=max_tokens,
#         stream=True,
#         temperature=temperature,
#         top_p=top_p,
#     ):
#         token = message.choices[0].delta.content
#         response += token
#         yield response

# """
# For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
# """
# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(value="You are a Chatbox who only answers spiritual questions based on Indian scriptures and declines answering other questions.", label="System message"),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p (nucleus sampling)",
#         ),
#     ],
# )

# if __name__ == "__main__":
#     demo.launch()
# import os
# import gradio as gr
# from huggingface_hub import InferenceClient
# from huggingface_hub.utils import HfHubHTTPError

# MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
# HF_TOKEN = os.getenv("HF_TOKEN")  # ⚠️ set this in Spaces → Settings → Secrets

# client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)

# def _build_zephyr_prompt(system_message: str, history, user_msg: str) -> str:
#     parts = []
#     if system_message:
#         parts.append(f"<|system|>\n{system_message}\n</s>")
#     for u, a in (history or []):
#         if u:
#             parts.append(f"<|user|>\n{u}\n</s>")
#         if a:
#             parts.append(f"<|assistant|>\n{a}\n</s>")
#     parts.append(f"<|user|>\n{user_msg}\n</s>\n<|assistant|>\n")
#     return "\n".join(parts)
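# # For illustration (the inputs here are made up), _build_zephyr_prompt(
# #     "Be concise.", [("Hi", "Namaste!")], "What is dharma?")
# # returns a Zephyr-style prompt of the form:
# #   <|system|>\nBe concise.\n</s>
# #   <|user|>\nHi\n</s>
# #   <|assistant|>\nNamaste!\n</s>
# #   <|user|>\nWhat is dharma?\n</s>\n<|assistant|>\n
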
# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     # Early guardrails for missing token
#     if not HF_TOKEN:
#         yield (
#             "⚠️ Missing HF_TOKEN.\n\n"
#             "Set a Hugging Face access token in your Space:\n"
#             "Settings → Repository secrets → Add secret → Name: HF_TOKEN, Value: <your token>\n"
#             "Token needs at least 'read' scope."
#         )
#         return

#     # Try OpenAI-like chat completion first
#     try:
#         response_text = ""
#         for chunk in client.chat_completion(
#             messages=(
#                 [{"role": "system", "content": system_message}] if system_message else []
#             )
#             + [
#                 msg
#                 for pair in (history or [])
#                 for msg in (
#                     [{"role": "user", "content": pair[0]}] if pair and pair[0] else []
#                 )
#                 + (
#                     [{"role": "assistant", "content": pair[1]}]
#                     if pair and len(pair) > 1 and pair[1]
#                     else []
#                 )
#             ]
#             + [{"role": "user", "content": message}],
#             max_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             token = getattr(chunk.choices[0].delta, "content", None)
#             if token:
#                 response_text += token
#                 yield response_text
#         return
#     except HfHubHTTPError as e:
#         # Handle 401 explicitly with helpful guidance
#         try:
#             status = e.response.status_code
#         except Exception:
#             status = None
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized from Hugging Face Inference API.\n\n"
#                 "Fix:\n"
#                 "1) Create a token at https://huggingface.co/settings/tokens with at least 'read' scope.\n"
#                 "2) In your Space, go to Settings → Repository secrets → Add secret\n"
#                 "   Name: HF_TOKEN, Value: <your token>\n"
#                 "3) Restart the Space.\n"
#             )
#             return
#         # Otherwise drop to fallback
#     except Exception:
#         pass

#     # Fallback: raw text_generation with Zephyr chat format
#     zephyr_prompt = _build_zephyr_prompt(system_message, history, message)
#     try:
#         response_text = ""
#         # for tok in client.text_generation(
#         #     zephyr_prompt,
#         #     max_new_tokens=max_tokens,
#         #     temperature=temperature,
#         #     top_p=top_p,
#         #     stream=True,
#         #     stop=["</s>", "<|user|>", "<|assistant|>", "<|system|>"],
#         # ):
#         for tok in client.text_generation(
#             zephyr_prompt,
#             max_new_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             if tok:
#                 response_text += tok
#                 yield response_text
#     except HfHubHTTPError as e:
#         try:
#             status = e.response.status_code
#         except Exception:
#             status = None
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized (text_generation fallback).\n\n"
#                 "Set HF_TOKEN in Space secrets (Settings → Repository secrets)."
#             )
#         else:
#             yield f"[Inference error] {e}"
#     except Exception as e:
#         yield f"[Runtime error] {e}"

# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(
#             value=(
#                 "You are a Chatbot who only answers spiritual questions based "
#                 "on Indian scriptures and declines answering other questions."
#             ),
#             label="System message",
#         ),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p (nucleus sampling)",
#         ),
#     ],
# )

# if __name__ == "__main__":
#     demo.launch()
# import os
# import gradio as gr
# from huggingface_hub import InferenceClient
# from huggingface_hub.utils import HfHubHTTPError  # correct import for 0.22.x

# # You can override with a Space secret: MODEL_ID=<your preferred model>
# PREFERRED = os.getenv("MODEL_ID", "HuggingFaceH4/zephyr-7b-beta")

# # Accept either token name (matches your other Spaces)
# HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")

# # If your preferred endpoint is down, we’ll try these in order:
# CANDIDATES = [
#     PREFERRED,
#     "google/gemma-2-2b-it",
#     "Qwen/Qwen2.5-1.5B-Instruct",
#     "tiiuae/falcon-7b-instruct",
# ]

# def _build_generic_prompt(system_message, history, user_msg):
#     """
#     Simple, model-agnostic chat prompt (works across many instruct models).
#     """
#     parts = []
#     if system_message:
#         parts.append(f"System: {system_message}")
#     for u, a in (history or []):
#         if u:
#             parts.append(f"User: {u}")
#         if a:
#             parts.append(f"Assistant: {a}")
#     parts.append(f"User: {user_msg}")
#     parts.append("Assistant:")
#     return "\n".join(parts)

# def _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p):
#     """
#     Try streaming via chat_completions; on failure, fall back to text_generation.
#     Returns a generator that yields text chunks.
#     Raises ValueError('NEXT') to indicate “try next model”.
#     """
#     client = InferenceClient(model=model_id, token=HF_TOKEN) if HF_TOKEN else InferenceClient(model=model_id)

#     # 1) Try chat-completions (if supported by the backend)
#     try:
#         msgs = (
#             [{"role": "system", "content": system_message}] if system_message else []
#         )
#         for u, a in (history or []):
#             if u:
#                 msgs.append({"role": "user", "content": u})
#             if a:
#                 msgs.append({"role": "assistant", "content": a})
#         msgs.append({"role": "user", "content": message})

#         def gen_chat():
#             response_text = ""
#             for chunk in client.chat_completion(
#                 messages=msgs,
#                 max_tokens=max_tokens,
#                 temperature=temperature,
#                 top_p=top_p,
#                 stream=True,
#             ):
#                 token = getattr(chunk.choices[0].delta, "content", None)
#                 if token:
#                     response_text += token
#                     yield response_text

#         # sanity probe: start the generator and yield progressively
#         for out in gen_chat():
#             yield out
#         return
#     except HfHubHTTPError as e:
#         status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
#         if status == 404:
#             # Endpoint not available for this model → try next candidate
#             raise ValueError("NEXT")
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized from HF Inference API.\n\n"
#                 "Ensure a read-scoped token is set (HF_TOKEN or HUGGINGFACEHUB_API_TOKEN) "
#                 "in Space secrets, then restart."
#             )
#             return
#         if status == 403:
#             yield (
#                 "❌ 403 Forbidden from HF Inference API.\n\n"
#                 "This model likely requires Inference Providers + billing on your token. "
#                 "Either enable those or switch to a free hosted model using the MODEL_ID secret."
#             )
#             return
#         # fall through to text_generation for other statuses
#     except Exception:
#         # fall through to text_generation
#         pass

#     # 2) Fallback: plain text_generation with a generic prompt
#     prompt = _build_generic_prompt(system_message, history, message)
#     try:
#         response_text = ""
#         for tok in client.text_generation(
#             prompt,
#             max_new_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             # Manual stop filtering (since huggingface_hub==0.22.x lacks 'stop' kwarg)
#             if any(s in tok for s in ["</s>", "<|user|>", "<|assistant|>", "<|system|>"]):
#                 break
#             if tok:
#                 response_text += tok
#                 yield response_text
#     except HfHubHTTPError as e:
#         status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
#         if status == 404:
#             # Endpoint not available for this model → try next candidate
#             raise ValueError("NEXT")
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized (text-generation fallback).\n\n"
#                 "Set HF_TOKEN or HUGGINGFACEHUB_API_TOKEN in Space secrets and restart."
#             )
#         elif status == 403:
#             yield (
#                 "❌ 403 Forbidden (text-generation fallback).\n\n"
#                 "Your token lacks 'Use Inference API/Providers' or billing is not enabled. "
#                 "Enable those or use a free hosted model via MODEL_ID."
#             )
#         else:
#             yield f"[Inference error] {e}"
#     except Exception as e:
#         yield f"[Runtime error] {e}"

# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     last_error = None
#     tried = []
#     for model_id in [m for m in CANDIDATES if m]:
#         tried.append(model_id)
#         try:
#             for chunk in _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p):
#                 yield chunk
#             # If we streamed anything without raising, we’re done
#             return
#         except ValueError as ve:
#             if str(ve) == "NEXT":
#                 last_error = f"Model `{model_id}` endpoint unavailable (404)."
#                 continue
#             else:
#                 last_error = str(ve)
#         except Exception as e:
#             last_error = f"Unexpected error on `{model_id}`: {e}"

#     # If we got here, all candidates failed
#     tried_str = " → ".join(tried) if tried else "(none)"
#     yield (
#         "❌ All candidate models failed.\n\n"
#         f"Tried: {tried_str}\n\n"
#         f"Last error: {last_error or 'unknown'}\n\n"
#         "Fixes:\n"
#         "• Set MODEL_ID in Space secrets to a hosted model that’s online (e.g., google/gemma-2-2b-it, Qwen/Qwen2.5-1.5B-Instruct).\n"
#         "• Or enable Inference Providers + billing on your HF token for models served via providers.\n"
#     )

# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(
#             value=("You are a Chatbot who only answers spiritual questions based on Indian scriptures "
#                    "and declines answering other questions."),
#             label="System message",
#         ),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
#     ],
# )

# if __name__ == "__main__":
#     demo.launch(share=True)
import os

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Load once at startup
print(f"🔧 Loading local model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # CPU-friendly
)
model.eval()


def build_prompt(system_message: str, history, user_msg: str) -> str:
    """Try to use the model's chat template if present; otherwise use a generic prompt."""
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    for u, a in (history or []):
        if u:
            messages.append({"role": "user", "content": u})
        if a:
            messages.append({"role": "assistant", "content": a})
    messages.append({"role": "user", "content": user_msg})

    # Use chat template when available
    try:
        if getattr(tokenizer, "chat_template", None):
            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:
        pass

    # Fallback generic formatting
    parts = []
    if system_message:
        parts.append(f"System: {system_message}")
    for u, a in (history or []):
        if u:
            parts.append(f"User: {u}")
        if a:
            parts.append(f"Assistant: {a}")
    parts.append(f"User: {user_msg}")
    parts.append("Assistant:")
    return "\n".join(parts)
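
# Illustrative only (the inputs below are made up): when the tokenizer has no chat
# template, build_prompt("Be concise.", [("Hi", "Namaste!")], "What is dharma?")
# returns the generic format:
#   System: Be concise.
#   User: Hi
#   Assistant: Namaste!
#   User: What is dharma?
#   Assistant: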

def respond(message, history, system_message, max_tokens, temperature, top_p):
    prompt = build_prompt(system_message, history, message)
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            do_sample=True,
            temperature=float(temperature),
            top_p=float(top_p),
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated portion
    gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
    text = tokenizer.decode(gen_ids, skip_special_tokens=True)

    # Pseudo-stream the decoded text in small chunks so the UI feels live
    # (see the commented sketch below for true token-by-token streaming)
    acc = ""
    for i in range(0, len(text), 40):
        acc += text[i:i+40]
        yield acc
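
# Optional alternative: stream tokens as they are generated instead of re-chunking
# the final text. A minimal sketch assuming the same `tokenizer`, `model`, and
# `build_prompt` defined above (the function name `respond_streaming` is only
# illustrative); kept commented out so the Space's behavior is unchanged.
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
#     prompt = build_prompt(system_message, history, message)
#     inputs = tokenizer(prompt, return_tensors="pt")
#     # Skip echoing the prompt and drop special tokens from the streamed text
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     gen_kwargs = dict(
#         **inputs,
#         max_new_tokens=int(max_tokens),
#         do_sample=True,
#         temperature=float(temperature),
#         top_p=float(top_p),
#         pad_token_id=tokenizer.eos_token_id,
#         streamer=streamer,
#     )
#     # Run generation in a background thread; the streamer yields decoded text pieces
#     Thread(target=model.generate, kwargs=gen_kwargs).start()
#     acc = ""
#     for piece in streamer:
#         acc += piece
#         yield acc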

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value=(
                "You are a spiritual assistant who only answers spiritual questions based on "
                "Indian Hindu scriptures (e.g., the Bhagavad Gita) and politely declines all other questions."
            ),
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    # share=True gives you a public gradio.live link when running locally
    demo.launch(share=True)
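
# The active script above only needs a few packages at runtime; a minimal
# requirements.txt sketch for the Space (names taken from the imports above,
# version pins omitted — add huggingface_hub or llama-cpp-python only if one
# of the commented variants is re-enabled):
#
#   gradio
#   torch
#   transformers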

# import os
# import gradio as gr

# # ---- llama.cpp backend (fast CPU) ----
# from llama_cpp import Llama

# # ---- to list files in a repo and pick a GGUF automatically ----
# from huggingface_hub import list_repo_files

# # ----------------- Config -----------------
# # You can override these via Space "Settings → Variables"
# # If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
# MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None

# # Known small GGUF chat repos (fast & lightweight). We'll try them in order.
# CANDIDATE_REPOS = [
#     MODEL_REPO,  # user-preferred first (may be None)
#     "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
#     "Qwen/Qwen2-0.5B-Instruct-GGUF",
#     "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
#     "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
# ]

# # Best-to-worst file name patterns to prefer when multiple GGUFs are present.
# PREFERRED_PATTERNS = [
#     "q4_k_m.gguf", "Q4_K_M.gguf",
#     "q4_0.gguf", "Q4_0.gguf",
#     "q5_k_m.gguf", "Q5_K_M.gguf",
#     ".gguf",  # catch-all
# ]

# # Runtime knobs
# N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
# CTX = int(os.getenv("CTX", "2048"))

# SYSTEM_DEFAULT = (
#     "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
#     "and politely declines other questions."
# )

# # --------------- GGUF Picker ---------------
# def pick_repo_and_file():
#     """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
#     tried = []
#     for repo in [r for r in CANDIDATE_REPOS if r]:  # drop None
#         try:
#             files = list_repo_files(repo)
#         except Exception:
#             tried.append(f"{repo} (list failed)")
#             continue
#         ggufs = [f for f in files if f.lower().endswith(".gguf")]
#         if not ggufs:
#             tried.append(f"{repo} (no .gguf)")
#             continue
#         # pick by pattern preference
#         for pat in PREFERRED_PATTERNS:
#             for f in ggufs:
#                 if pat in f:
#                     return repo, f
#     tried_str = " | ".join(tried) if tried else "(none)"
#     raise RuntimeError(
#         "No GGUF file found in any candidate repo.\n"
#         f"Tried: {tried_str}\n"
#         "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
#         "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
#     )

# REPO_ID, FILENAME = pick_repo_and_file()
# print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")

# llm = Llama.from_pretrained(
#     repo_id=REPO_ID,
#     filename=FILENAME,
#     n_ctx=CTX,
#     n_threads=N_THREADS,
#     n_gpu_layers=0,      # CPU only
#     logits_all=False,
#     verbose=False,
# )

# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     sysmsg = system_message or SYSTEM_DEFAULT
#     msgs = [{"role": "system", "content": sysmsg}]
#     for u, a in (history or []):
#         if u:
#             msgs.append({"role": "user", "content": u})
#         if a:
#             msgs.append({"role": "assistant", "content": a})
#     msgs.append({"role": "user", "content": message})

#     stream = llm.create_chat_completion(
#         messages=msgs,
#         temperature=float(temperature),
#         top_p=float(top_p),
#         max_tokens=int(max_tokens),
#         stream=True,
#     )
#     acc = ""
#     for chunk in stream:
#         delta = chunk["choices"][0]["delta"]
#         tok = delta.get("content", "")
#         if tok:
#             acc += tok
#             yield acc

# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
#         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
#     ],
# )

# if __name__ == "__main__":
#     print(f"🧵 Threads: {N_THREADS}")
#     demo.launch(share=True)