# import gradio as gr
# from huggingface_hub import InferenceClient
# import spaces

# """
# For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
# """
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

# @spaces.GPU
# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     messages = [{"role": "system", "content": system_message}]
#     for val in history:
#         if val[0]:
#             messages.append({"role": "user", "content": val[0]})
#         if val[1]:
#             messages.append({"role": "assistant", "content": val[1]})
#     messages.append({"role": "user", "content": message})
#     response = ""
#     for message in client.chat_completion(
#         messages,
#         max_tokens=max_tokens,
#         stream=True,
#         temperature=temperature,
#         top_p=top_p,
#     ):
#         token = message.choices[0].delta.content
#         response += token
#         yield response

# """
# For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
# """
# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(value="You are a Chatbox who only answers spiritual questions based on Indian scriptures and declines answering other questions.", label="System message"),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p (nucleus sampling)",
#         ),
#     ],
# )

# if __name__ == "__main__":
#     demo.launch()
# import os
# import gradio as gr
# from huggingface_hub import InferenceClient
# from huggingface_hub.utils import HfHubHTTPError

# MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
# HF_TOKEN = os.getenv("HF_TOKEN")  # ⚠️ set this in Spaces → Settings → Secrets

# client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)

# def _build_zephyr_prompt(system_message: str, history, user_msg: str) -> str:
#     parts = []
#     if system_message:
#         parts.append(f"<|system|>\n{system_message}\n</s>")
#     for u, a in (history or []):
#         if u:
#             parts.append(f"<|user|>\n{u}\n</s>")
#         if a:
#             parts.append(f"<|assistant|>\n{a}\n</s>")
#     parts.append(f"<|user|>\n{user_msg}\n</s>\n<|assistant|>\n")
#     return "\n".join(parts)
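# # For illustration (the inputs here are made up), _build_zephyr_prompt(
# #     "Be concise.", [("Hi", "Namaste!")], "What is dharma?")
# # returns a Zephyr-style prompt of the form:
# #   <|system|>\nBe concise.\n</s>
# #   <|user|>\nHi\n</s>
# #   <|assistant|>\nNamaste!\n</s>
# #   <|user|>\nWhat is dharma?\n</s>\n<|assistant|>\n
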
# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     # Early guardrails for missing token
#     if not HF_TOKEN:
#         yield (
#             "⚠️ Missing HF_TOKEN.\n\n"
#             "Set a Hugging Face access token in your Space:\n"
#             "Settings → Repository secrets → Add secret → Name: HF_TOKEN, Value: <your token>\n"
#             "Token needs at least 'read' scope."
#         )
#         return

#     # Try OpenAI-like chat completion first
#     try:
#         response_text = ""
#         for chunk in client.chat_completion(
#             messages=(
#                 [{"role": "system", "content": system_message}] if system_message else []
#             )
#             + [
#                 msg
#                 for pair in (history or [])
#                 for msg in (
#                     [{"role": "user", "content": pair[0]}] if pair and pair[0] else []
#                 )
#                 + (
#                     [{"role": "assistant", "content": pair[1]}]
#                     if pair and len(pair) > 1 and pair[1]
#                     else []
#                 )
#             ]
#             + [{"role": "user", "content": message}],
#             max_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             token = getattr(chunk.choices[0].delta, "content", None)
#             if token:
#                 response_text += token
#                 yield response_text
#         return
#     except HfHubHTTPError as e:
#         # Handle 401 explicitly with helpful guidance
#         try:
#             status = e.response.status_code
#         except Exception:
#             status = None
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized from Hugging Face Inference API.\n\n"
#                 "Fix:\n"
#                 "1) Create a token at https://huggingface.co/settings/tokens with at least 'read' scope.\n"
#                 "2) In your Space, go to Settings → Repository secrets → Add secret\n"
#                 "   Name: HF_TOKEN, Value: <your token>\n"
#                 "3) Restart the Space.\n"
#             )
#             return
#         # Otherwise drop to fallback
#     except Exception:
#         pass

#     # Fallback: raw text_generation with Zephyr chat format
#     zephyr_prompt = _build_zephyr_prompt(system_message, history, message)
#     try:
#         response_text = ""
#         # for tok in client.text_generation(
#         #     zephyr_prompt,
#         #     max_new_tokens=max_tokens,
#         #     temperature=temperature,
#         #     top_p=top_p,
#         #     stream=True,
#         #     stop=["</s>", "<|user|>", "<|assistant|>", "<|system|>"],
#         # ):
#         for tok in client.text_generation(
#             zephyr_prompt,
#             max_new_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             if tok:
#                 response_text += tok
#                 yield response_text
#     except HfHubHTTPError as e:
#         try:
#             status = e.response.status_code
#         except Exception:
#             status = None
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized (text_generation fallback).\n\n"
#                 "Set HF_TOKEN in Space secrets (Settings → Repository secrets)."
#             )
#         else:
#             yield f"[Inference error] {e}"
#     except Exception as e:
#         yield f"[Runtime error] {e}"

# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(
#             value=(
#                 "You are a Chatbot who only answers spiritual questions based "
#                 "on Indian scriptures and declines answering other questions."
#             ),
#             label="System message",
#         ),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p (nucleus sampling)",
#         ),
#     ],
# )

# if __name__ == "__main__":
#     demo.launch()
# import os
# import gradio as gr
# from huggingface_hub import InferenceClient
# from huggingface_hub.utils import HfHubHTTPError  # correct import for 0.22.x

# # You can override with a Space secret: MODEL_ID=<your preferred model>
# PREFERRED = os.getenv("MODEL_ID", "HuggingFaceH4/zephyr-7b-beta")

# # Accept either token name (matches your other Spaces)
# HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")

# # If your preferred endpoint is down, we’ll try these in order:
# CANDIDATES = [
#     PREFERRED,
#     "google/gemma-2-2b-it",
#     "Qwen/Qwen2.5-1.5B-Instruct",
#     "tiiuae/falcon-7b-instruct",
# ]

# def _build_generic_prompt(system_message, history, user_msg):
#     """
#     Simple, model-agnostic chat prompt (works across many instruct models).
#     """
#     parts = []
#     if system_message:
#         parts.append(f"System: {system_message}")
#     for u, a in (history or []):
#         if u:
#             parts.append(f"User: {u}")
#         if a:
#             parts.append(f"Assistant: {a}")
#     parts.append(f"User: {user_msg}")
#     parts.append("Assistant:")
#     return "\n".join(parts)

# def _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p):
#     """
#     Try streaming via chat_completions; on failure, fall back to text_generation.
#     Returns a generator that yields text chunks.
#     Raises ValueError('NEXT') to indicate “try next model”.
#     """
#     client = InferenceClient(model=model_id, token=HF_TOKEN) if HF_TOKEN else InferenceClient(model=model_id)

#     # 1) Try chat-completions (if supported by the backend)
#     try:
#         msgs = (
#             [{"role": "system", "content": system_message}] if system_message else []
#         )
#         for u, a in (history or []):
#             if u:
#                 msgs.append({"role": "user", "content": u})
#             if a:
#                 msgs.append({"role": "assistant", "content": a})
#         msgs.append({"role": "user", "content": message})

#         def gen_chat():
#             response_text = ""
#             for chunk in client.chat_completion(
#                 messages=msgs,
#                 max_tokens=max_tokens,
#                 temperature=temperature,
#                 top_p=top_p,
#                 stream=True,
#             ):
#                 token = getattr(chunk.choices[0].delta, "content", None)
#                 if token:
#                     response_text += token
#                     yield response_text

#         # sanity probe: start the generator and yield progressively
#         for out in gen_chat():
#             yield out
#         return
#     except HfHubHTTPError as e:
#         status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
#         if status == 404:
#             # Endpoint not available for this model → try next candidate
#             raise ValueError("NEXT")
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized from HF Inference API.\n\n"
#                 "Ensure a read-scoped token is set (HF_TOKEN or HUGGINGFACEHUB_API_TOKEN) "
#                 "in Space secrets, then restart."
#             )
#             return
#         if status == 403:
#             yield (
#                 "❌ 403 Forbidden from HF Inference API.\n\n"
#                 "This model likely requires Inference Providers + billing on your token. "
#                 "Either enable those or switch to a free hosted model using the MODEL_ID secret."
#             )
#             return
#         # fall through to text_generation for other statuses
#     except Exception:
#         # fall through to text_generation
#         pass

#     # 2) Fallback: plain text_generation with a generic prompt
#     prompt = _build_generic_prompt(system_message, history, message)
#     try:
#         response_text = ""
#         for tok in client.text_generation(
#             prompt,
#             max_new_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             # Manual stop filtering (since huggingface_hub==0.22.x lacks 'stop' kwarg)
#             if any(s in tok for s in ["</s>", "<|user|>", "<|assistant|>", "<|system|>"]):
#                 break
#             if tok:
#                 response_text += tok
#                 yield response_text
#     except HfHubHTTPError as e:
#         status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
#         if status == 404:
#             # Endpoint not available for this model → try next candidate
#             raise ValueError("NEXT")
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized (text-generation fallback).\n\n"
#                 "Set HF_TOKEN or HUGGINGFACEHUB_API_TOKEN in Space secrets and restart."
#             )
#         elif status == 403:
#             yield (
#                 "❌ 403 Forbidden (text-generation fallback).\n\n"
#                 "Your token lacks 'Use Inference API/Providers' or billing is not enabled. "
#                 "Enable those or use a free hosted model via MODEL_ID."
#             )
#         else:
#             yield f"[Inference error] {e}"
#     except Exception as e:
#         yield f"[Runtime error] {e}"

# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     last_error = None
#     tried = []
#     for model_id in [m for m in CANDIDATES if m]:
#         tried.append(model_id)
#         try:
#             for chunk in _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p):
#                 yield chunk
#             # If we streamed anything without raising, we’re done
#             return
#         except ValueError as ve:
#             if str(ve) == "NEXT":
#                 last_error = f"Model `{model_id}` endpoint unavailable (404)."
#                 continue
#             else:
#                 last_error = str(ve)
#         except Exception as e:
#             last_error = f"Unexpected error on `{model_id}`: {e}"

#     # If we got here, all candidates failed
#     tried_str = " → ".join(tried) if tried else "(none)"
#     yield (
#         "❌ All candidate models failed.\n\n"
#         f"Tried: {tried_str}\n\n"
#         f"Last error: {last_error or 'unknown'}\n\n"
#         "Fixes:\n"
#         "• Set MODEL_ID in Space secrets to a hosted model that’s online (e.g., google/gemma-2-2b-it, Qwen/Qwen2.5-1.5B-Instruct).\n"
#         "• Or enable Inference Providers + billing on your HF token for models served via providers.\n"
#     )

# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(
#             value=("You are a Chatbot who only answers spiritual questions based on Indian scriptures "
#                    "and declines answering other questions."),
#             label="System message",
#         ),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
#     ],
# )

# if __name__ == "__main__":
#     demo.launch(share=True)
import os

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Load once at startup
print(f"🔧 Loading local model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # CPU-friendly
)
model.eval()


def build_prompt(system_message: str, history, user_msg: str) -> str:
    """Try to use the model's chat template if present; otherwise use a generic prompt."""
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    for u, a in (history or []):
        if u:
            messages.append({"role": "user", "content": u})
        if a:
            messages.append({"role": "assistant", "content": a})
    messages.append({"role": "user", "content": user_msg})

    # Use chat template when available
    try:
        if getattr(tokenizer, "chat_template", None):
            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:
        pass

    # Fallback generic formatting
    parts = []
    if system_message:
        parts.append(f"System: {system_message}")
    for u, a in (history or []):
        if u:
            parts.append(f"User: {u}")
        if a:
            parts.append(f"Assistant: {a}")
    parts.append(f"User: {user_msg}")
    parts.append("Assistant:")
    return "\n".join(parts)
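
# Illustrative only (the inputs below are made up): when the tokenizer has no chat
# template, build_prompt("Be concise.", [("Hi", "Namaste!")], "What is dharma?")
# returns the generic format:
#   System: Be concise.
#   User: Hi
#   Assistant: Namaste!
#   User: What is dharma?
#   Assistant: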

def respond(message, history, system_message, max_tokens, temperature, top_p):
    prompt = build_prompt(system_message, history, message)
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            do_sample=True,
            temperature=float(temperature),
            top_p=float(top_p),
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated portion
    gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
    text = tokenizer.decode(gen_ids, skip_special_tokens=True)

    # Pseudo-stream the decoded text in small chunks so the UI feels live
    # (see the commented sketch below for true token-by-token streaming)
    acc = ""
    for i in range(0, len(text), 40):
        acc += text[i:i+40]
        yield acc
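
# Optional alternative: stream tokens as they are generated instead of re-chunking
# the final text. A minimal sketch assuming the same `tokenizer`, `model`, and
# `build_prompt` defined above (the function name `respond_streaming` is only
# illustrative); kept commented out so the Space's behavior is unchanged.
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
#     prompt = build_prompt(system_message, history, message)
#     inputs = tokenizer(prompt, return_tensors="pt")
#     # Skip echoing the prompt and drop special tokens from the streamed text
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     gen_kwargs = dict(
#         **inputs,
#         max_new_tokens=int(max_tokens),
#         do_sample=True,
#         temperature=float(temperature),
#         top_p=float(top_p),
#         pad_token_id=tokenizer.eos_token_id,
#         streamer=streamer,
#     )
#     # Run generation in a background thread; the streamer yields decoded text pieces
#     Thread(target=model.generate, kwargs=gen_kwargs).start()
#     acc = ""
#     for piece in streamer:
#         acc += piece
#         yield acc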

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value=(
                "You are a spiritual assistant who only answers spiritual questions based on "
                "Indian Hindu scriptures (e.g., the Bhagavad Gita) and politely declines all other questions."
            ),
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    # share=True gives you a public gradio.live link when running locally
    demo.launch(share=True)
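
# The active script above only needs a few packages at runtime; a minimal
# requirements.txt sketch for the Space (names taken from the imports above,
# version pins omitted — add huggingface_hub or llama-cpp-python only if one
# of the commented variants is re-enabled):
#
#   gradio
#   torch
#   transformers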

# import os
# import gradio as gr

# # ---- llama.cpp backend (fast CPU) ----
# from llama_cpp import Llama

# # ---- to list files in a repo and pick a GGUF automatically ----
# from huggingface_hub import list_repo_files

# # ----------------- Config -----------------
# # You can override these via Space "Settings → Variables"
# # If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
# MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None

# # Known small GGUF chat repos (fast & lightweight). We'll try them in order.
# CANDIDATE_REPOS = [
#     MODEL_REPO,  # user-preferred first (may be None)
#     "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
#     "Qwen/Qwen2-0.5B-Instruct-GGUF",
#     "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
#     "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
# ]

# # Best-to-worst file name patterns to prefer when multiple GGUFs are present.
# PREFERRED_PATTERNS = [
#     "q4_k_m.gguf", "Q4_K_M.gguf",
#     "q4_0.gguf", "Q4_0.gguf",
#     "q5_k_m.gguf", "Q5_K_M.gguf",
#     ".gguf",  # catch-all
# ]

# # Runtime knobs
# N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
# CTX = int(os.getenv("CTX", "2048"))

# SYSTEM_DEFAULT = (
#     "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
#     "and politely declines other questions."
# )

# # --------------- GGUF Picker ---------------
# def pick_repo_and_file():
#     """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
#     tried = []
#     for repo in [r for r in CANDIDATE_REPOS if r]:  # drop None
#         try:
#             files = list_repo_files(repo)
#         except Exception:
#             tried.append(f"{repo} (list failed)")
#             continue
#         ggufs = [f for f in files if f.lower().endswith(".gguf")]
#         if not ggufs:
#             tried.append(f"{repo} (no .gguf)")
#             continue
#         # pick by pattern preference
#         for pat in PREFERRED_PATTERNS:
#             for f in ggufs:
#                 if pat in f:
#                     return repo, f
#     tried_str = " | ".join(tried) if tried else "(none)"
#     raise RuntimeError(
#         "No GGUF file found in any candidate repo.\n"
#         f"Tried: {tried_str}\n"
#         "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
#         "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
#     )

# REPO_ID, FILENAME = pick_repo_and_file()
# print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")

# llm = Llama.from_pretrained(
#     repo_id=REPO_ID,
#     filename=FILENAME,
#     n_ctx=CTX,
#     n_threads=N_THREADS,
#     n_gpu_layers=0,      # CPU only
#     logits_all=False,
#     verbose=False,
# )

# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     sysmsg = system_message or SYSTEM_DEFAULT
#     msgs = [{"role": "system", "content": sysmsg}]
#     for u, a in (history or []):
#         if u:
#             msgs.append({"role": "user", "content": u})
#         if a:
#             msgs.append({"role": "assistant", "content": a})
#     msgs.append({"role": "user", "content": message})

#     stream = llm.create_chat_completion(
#         messages=msgs,
#         temperature=float(temperature),
#         top_p=float(top_p),
#         max_tokens=int(max_tokens),
#         stream=True,
#     )
#     acc = ""
#     for chunk in stream:
#         delta = chunk["choices"][0]["delta"]
#         tok = delta.get("content", "")
#         if tok:
#             acc += tok
#             yield acc

# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
#         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
#     ],
# )

# if __name__ == "__main__":
#     print(f"🧵 Threads: {N_THREADS}")
#     demo.launch(share=True)