Update app.py
app.py CHANGED
@@ -403,86 +403,205 @@
-# import os
-# import gradio as gr
-# import torch
-# from transformers import AutoTokenizer, AutoModelForCausalLM
-
-# # You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
-# MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-
-# # Load once at startup
-# print(f"🔧 Loading local model: {MODEL_ID}")
-# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
-# model = AutoModelForCausalLM.from_pretrained(
-#     MODEL_ID,
-#     torch_dtype=torch.float32,  # CPU-friendly
-# )
-# model.eval()
-
-# def build_prompt(system_message: str, history, user_msg: str) -> str:
-#     """Try to use the model's chat template if present; otherwise use a generic prompt."""
-#     messages = []
-#     if system_message:
-#         messages.append({"role": "system", "content": system_message})
-#     for u, a in (history or []):
-#         if u:
-#             messages.append({"role": "user", "content": u})
-#         if a:
-#             messages.append({"role": "assistant", "content": a})
-#     messages.append({"role": "user", "content": user_msg})
-
-#     # Use chat template when available
-#     try:
-#         if getattr(tokenizer, "chat_template", None):
-#             return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-#     except Exception:
-#         pass
-
-#     # Fallback generic formatting
-#     parts = []
-#     if system_message:
-#         parts.append(f"System: {system_message}")
-#     for u, a in (history or []):
-#         if u:
-#             parts.append(f"User: {u}")
-#         if a:
-#             parts.append(f"Assistant: {a}")
-#     parts.append(f"User: {user_msg}")
-#     parts.append("Assistant:")
-#     return "\n".join(parts)
-
-# def respond(message, history, system_message, max_tokens, temperature, top_p):
-#     prompt = build_prompt(system_message, history, message)
-#     inputs = tokenizer(prompt, return_tensors="pt")
-#     with torch.no_grad():
-#         outputs = model.generate(
-#             **inputs,
-#             max_new_tokens=int(max_tokens),
-#             do_sample=True,
-#             temperature=float(temperature),
-#             top_p=float(top_p),
-#             pad_token_id=tokenizer.eos_token_id,
-#             eos_token_id=tokenizer.eos_token_id,
-#         )
-#     # Decode only the newly generated portion
-#     gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
-#     text = tokenizer.decode(gen_ids, skip_special_tokens=True)
-
-#     # Stream the text in chunks so the UI feels live
-#     acc = ""
-#     for i in range(0, len(text), 40):
-#         acc += text[i:i+40]
-#         yield acc
-
-# demo = gr.ChatInterface(
-#     respond,
-#     additional_inputs=[
-#         gr.Textbox(
-#             value=("You are a Chatbot who only answers spiritual questions based on three religiousscriptures (a) Hindu - e.g.Bhagwadgita, (b) Jewish, e.g. Torah, (c) Christian, e.g., Bible"
-#                    ". You will ffer all three perspectives. You decline answering other questions that do not relate to spirituality."),
-#             label="System message",
-#         ),
-#         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
-#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
@@ -490,126 +609,7 @@
 # )

 # if __name__ == "__main__":
-#     # share=True gives you a public link automatically
 #     demo.launch(share=True)

-import os
-import gradio as gr
-
-# ---- llama.cpp backend (fast CPU) ----
-from llama_cpp import Llama
-
-# ---- to list files in a repo and pick a GGUF automatically ----
-from huggingface_hub import list_repo_files
-
-# ----------------- Config -----------------
-# You can override these via Space "Settings → Variables"
-# If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
-MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None
-
-# Known small GGUF chat repos (fast & lightweight). We'll try them in order.
-CANDIDATE_REPOS = [
-    MODEL_REPO,  # user-preferred first (may be None)
-    "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
-    "Qwen/Qwen2-0.5B-Instruct-GGUF",
-    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
-    "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
-]
-
-# Best-to-worst file name patterns to prefer when multiple GGUFs are present.
-PREFERRED_PATTERNS = [
-    "q4_k_m.gguf", "Q4_K_M.gguf",
-    "q4_0.gguf", "Q4_0.gguf",
-    "q5_k_m.gguf", "Q5_K_M.gguf",
-    ".gguf",  # catch-all
-]
-
-# Runtime knobs
-N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
-CTX = int(os.getenv("CTX", "2048"))
-
-SYSTEM_DEFAULT = (
-    "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
-    "and politely decline other questions."
-)
-
-# --------------- GGUF Picker ---------------
-def pick_repo_and_file():
-    """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
-    tried = []
-    for repo in [r for r in CANDIDATE_REPOS if r]:  # drop None
-        try:
-            files = list_repo_files(repo)
-        except Exception:
-            tried.append(f"{repo} (list failed)")
-            continue
-        ggufs = [f for f in files if f.lower().endswith(".gguf")]
-        if not ggufs:
-            tried.append(f"{repo} (no .gguf)")
-            continue
-        # pick by pattern preference
-        for pat in PREFERRED_PATTERNS:
-            for f in ggufs:
-                if pat in f:
-                    return repo, f
-    tried_str = " | ".join(tried) if tried else "(none)"
-    raise RuntimeError(
-        "No GGUF file found in any candidate repo.\n"
-        f"Tried: {tried_str}\n"
-        "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
-        "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
-    )
-
-REPO_ID, FILENAME = pick_repo_and_file()
-print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")
-
-llm = Llama.from_pretrained(
-    repo_id=REPO_ID,
-    filename=FILENAME,
-    n_ctx=CTX,
-    n_threads=N_THREADS,
-    n_gpu_layers=0,  # CPU only
-    logits_all=False,
-    verbose=False,
-)
-
-def respond(message, history, system_message, max_tokens, temperature, top_p):
-    sysmsg = system_message or SYSTEM_DEFAULT
-    msgs = [{"role": "system", "content": sysmsg}]
-    for u, a in (history or []):
-        if u:
-            msgs.append({"role": "user", "content": u})
-        if a:
-            msgs.append({"role": "assistant", "content": a})
-    msgs.append({"role": "user", "content": message})
-
-    stream = llm.create_chat_completion(
-        messages=msgs,
-        temperature=float(temperature),
-        top_p=float(top_p),
-        max_tokens=int(max_tokens),
-        stream=True,
-    )
-    acc = ""
-    for chunk in stream:
-        delta = chunk["choices"][0]["delta"]
-        tok = delta.get("content", "")
-        if tok:
-            acc += tok
-            yield acc
-
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
-        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
-    ],
-)
-
-if __name__ == "__main__":
-    print(f"🧵 Threads: {N_THREADS}")
-    demo.launch(share=True)
-
+import os
+import gradio as gr
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
+MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+
+# Load once at startup
+print(f"🔧 Loading local model: {MODEL_ID}")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.float32,  # CPU-friendly
+)
+model.eval()
+
+def build_prompt(system_message: str, history, user_msg: str) -> str:
+    """Try to use the model's chat template if present; otherwise use a generic prompt."""
+    messages = []
+    if system_message:
+        messages.append({"role": "system", "content": system_message})
+    for u, a in (history or []):
+        if u:
+            messages.append({"role": "user", "content": u})
+        if a:
+            messages.append({"role": "assistant", "content": a})
+    messages.append({"role": "user", "content": user_msg})
+
+    # Use chat template when available
+    try:
+        if getattr(tokenizer, "chat_template", None):
+            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    except Exception:
+        pass
+
+    # Fallback generic formatting
+    parts = []
+    if system_message:
+        parts.append(f"System: {system_message}")
+    for u, a in (history or []):
+        if u:
+            parts.append(f"User: {u}")
+        if a:
+            parts.append(f"Assistant: {a}")
+    parts.append(f"User: {user_msg}")
+    parts.append("Assistant:")
+    return "\n".join(parts)
+
+def respond(message, history, system_message, max_tokens, temperature, top_p):
+    prompt = build_prompt(system_message, history, message)
+    inputs = tokenizer(prompt, return_tensors="pt")
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=int(max_tokens),
+            do_sample=True,
+            temperature=float(temperature),
+            top_p=float(top_p),
+            pad_token_id=tokenizer.eos_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+        )
+    # Decode only the newly generated portion
+    gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
+    text = tokenizer.decode(gen_ids, skip_special_tokens=True)
+
+    # Stream the text in chunks so the UI feels live
+    acc = ""
+    for i in range(0, len(text), 40):
+        acc += text[i:i+40]
+        yield acc
+
+demo = gr.ChatInterface(
+    respond,
+    additional_inputs=[
+        gr.Textbox(
+            value=("You are a Chatbot who only answers spiritual questions based on three religious scriptures: (a) Hindu, e.g. the Bhagwadgita, (b) Jewish, e.g. the Torah, (c) Christian, e.g. the Bible"
+                   ". You will offer all three perspectives. You decline to answer other questions that do not relate to spirituality."),
+            label="System message",
+        ),
+        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+    ],
+)
+
+if __name__ == "__main__":
+    # share=True gives you a public link automatically
+    demo.launch(share=True)
+
+# import os
+# import gradio as gr
+
+# # ---- llama.cpp backend (fast CPU) ----
+# from llama_cpp import Llama
+
+# # ---- to list files in a repo and pick a GGUF automatically ----
+# from huggingface_hub import list_repo_files
+
+# # ----------------- Config -----------------
+# # You can override these via Space "Settings → Variables"
+# # If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
+# MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None
+
+# # Known small GGUF chat repos (fast & lightweight). We'll try them in order.
+# CANDIDATE_REPOS = [
+#     MODEL_REPO,  # user-preferred first (may be None)
+#     "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
+#     "Qwen/Qwen2-0.5B-Instruct-GGUF",
+#     "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+#     "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
+# ]
+
+# # Best-to-worst file name patterns to prefer when multiple GGUFs are present.
+# PREFERRED_PATTERNS = [
+#     "q4_k_m.gguf", "Q4_K_M.gguf",
+#     "q4_0.gguf", "Q4_0.gguf",
+#     "q5_k_m.gguf", "Q5_K_M.gguf",
+#     ".gguf",  # catch-all
+# ]
+
+# # Runtime knobs
+# N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
+# CTX = int(os.getenv("CTX", "2048"))
+
+# SYSTEM_DEFAULT = (
+#     "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
+#     "and politely decline other questions."
+# )
+
+# # --------------- GGUF Picker ---------------
+# def pick_repo_and_file():
+#     """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
+#     tried = []
+#     for repo in [r for r in CANDIDATE_REPOS if r]:  # drop None
+#         try:
+#             files = list_repo_files(repo)
+#         except Exception:
+#             tried.append(f"{repo} (list failed)")
+#             continue
+#         ggufs = [f for f in files if f.lower().endswith(".gguf")]
+#         if not ggufs:
+#             tried.append(f"{repo} (no .gguf)")
+#             continue
+#         # pick by pattern preference
+#         for pat in PREFERRED_PATTERNS:
+#             for f in ggufs:
+#                 if pat in f:
+#                     return repo, f
+#     tried_str = " | ".join(tried) if tried else "(none)"
+#     raise RuntimeError(
+#         "No GGUF file found in any candidate repo.\n"
+#         f"Tried: {tried_str}\n"
+#         "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
+#         "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
+#     )
+
+# REPO_ID, FILENAME = pick_repo_and_file()
+# print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")
+
+# llm = Llama.from_pretrained(
+#     repo_id=REPO_ID,
+#     filename=FILENAME,
+#     n_ctx=CTX,
+#     n_threads=N_THREADS,
+#     n_gpu_layers=0,  # CPU only
+#     logits_all=False,
+#     verbose=False,
+# )
+
+# def respond(message, history, system_message, max_tokens, temperature, top_p):
+#     sysmsg = system_message or SYSTEM_DEFAULT
+#     msgs = [{"role": "system", "content": sysmsg}]
+#     for u, a in (history or []):
+#         if u:
+#             msgs.append({"role": "user", "content": u})
+#         if a:
+#             msgs.append({"role": "assistant", "content": a})
+#     msgs.append({"role": "user", "content": message})
+
+#     stream = llm.create_chat_completion(
+#         messages=msgs,
+#         temperature=float(temperature),
+#         top_p=float(top_p),
+#         max_tokens=int(max_tokens),
+#         stream=True,
+#     )
+#     acc = ""
+#     for chunk in stream:
+#         delta = chunk["choices"][0]["delta"]
+#         tok = delta.get("content", "")
+#         if tok:
+#             acc += tok
+#             yield acc
+
+# demo = gr.ChatInterface(
+#     respond,
+#     additional_inputs=[
+#         gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
+#         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
+#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
 #     ],
 # )

 # if __name__ == "__main__":
+#     print(f"🧵 Threads: {N_THREADS}")
 #     demo.launch(share=True)
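
For models that ship without a chat template, the new build_prompt falls back to a plain System:/User:/Assistant: transcript. A minimal sketch of what that fallback branch produces, using a made-up conversation (the messages below are illustrative and not part of the Space):

# Hypothetical inputs, for illustration only.
system_message = "You are a concise assistant."
history = [("Hello", "Hi! How can I help?")]
user_msg = "What does the Torah say about charity?"

# The fallback branch of build_prompt joins the turns like this:
prompt = "\n".join([
    f"System: {system_message}",
    f"User: {history[0][0]}",
    f"Assistant: {history[0][1]}",
    f"User: {user_msg}",
    "Assistant:",
])
# prompt is now:
# System: You are a concise assistant.
# User: Hello
# Assistant: Hi! How can I help?
# User: What does the Torah say about charity?
# Assistant: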
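
The new respond waits for model.generate to finish and then replays the text in 40-character slices, so the UI only appears to stream. If true token-by-token streaming is wanted, one possible variation is transformers.TextIteratorStreamer; this is a sketch assuming the same tokenizer, model, and build_prompt defined above, and respond_streaming is a hypothetical name, not part of the commit:

import threading
from transformers import TextIteratorStreamer

def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    prompt = build_prompt(system_message, history, message)
    inputs = tokenizer(prompt, return_tensors="pt")
    # The streamer yields decoded text pieces as generate() produces tokens.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=int(max_tokens),
        do_sample=True,
        temperature=float(temperature),
        top_p=float(top_p),
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )
    # generate() blocks, so run it in a background thread and consume the streamer here.
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    acc = ""
    for piece in streamer:
        acc += piece
        yield acc
    thread.join()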
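
Since gr.ChatInterface simply iterates whatever generator respond returns, the new backend can also be exercised from a plain Python session before launching the UI. A rough smoke test, with an arbitrary question and settings (not part of the commit):

# Assumes the new app.py has been imported so `respond` is in scope.
last = ""
for partial in respond(
    "Say hello in one sentence.",    # message
    [],                              # empty chat history
    "You are a concise assistant.",  # hypothetical system message
    64,                              # max_tokens
    0.7,                             # temperature
    0.95,                            # top_p
):
    last = partial
print(last)  # the final accumulated reply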