rajeshlion committed
Commit 4ab8ff5 · verified · 1 Parent(s): b91179c

Update app.py

Files changed (1)
  1. app.py +186 -186
app.py CHANGED
@@ -403,86 +403,205 @@
403
 
404
 
405
 
406
  # import os
407
  # import gradio as gr
408
- # import torch
409
- # from transformers import AutoTokenizer, AutoModelForCausalLM
410
-
411
- # # You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
412
- # MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
413
-
414
- # # Load once at startup
415
- # print(f"🔧 Loading local model: {MODEL_ID}")
416
- # tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
417
- # model = AutoModelForCausalLM.from_pretrained(
418
- # MODEL_ID,
419
- # torch_dtype=torch.float32, # CPU-friendly
420
  # )
421
- # model.eval()
422
 
423
- # def build_prompt(system_message: str, history, user_msg: str) -> str:
424
- # """Try to use the model's chat template if present; otherwise use a generic prompt."""
425
- # messages = []
426
- # if system_message:
427
- # messages.append({"role": "system", "content": system_message})
428
- # for u, a in (history or []):
429
- # if u:
430
- # messages.append({"role": "user", "content": u})
431
- # if a:
432
- # messages.append({"role": "assistant", "content": a})
433
- # messages.append({"role": "user", "content": user_msg})
434
 
435
- # # Use chat template when available
436
- # try:
437
- # if getattr(tokenizer, "chat_template", None):
438
- # return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
439
- # except Exception:
440
- # pass
441
 
442
- # # Fallback generic formatting
443
- # parts = []
444
- # if system_message:
445
- # parts.append(f"System: {system_message}")
446
  # for u, a in (history or []):
447
  # if u:
448
- # parts.append(f"User: {u}")
449
  # if a:
450
- # parts.append(f"Assistant: {a}")
451
- # parts.append(f"User: {user_msg}")
452
- # parts.append("Assistant:")
453
- # return "\n".join(parts)
454
-
455
- # def respond(message, history, system_message, max_tokens, temperature, top_p):
456
- # prompt = build_prompt(system_message, history, message)
457
- # inputs = tokenizer(prompt, return_tensors="pt")
458
- # with torch.no_grad():
459
- # outputs = model.generate(
460
- # **inputs,
461
- # max_new_tokens=int(max_tokens),
462
- # do_sample=True,
463
- # temperature=float(temperature),
464
- # top_p=float(top_p),
465
- # pad_token_id=tokenizer.eos_token_id,
466
- # eos_token_id=tokenizer.eos_token_id,
467
- # )
468
- # # Decode only the newly generated portion
469
- # gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
470
- # text = tokenizer.decode(gen_ids, skip_special_tokens=True)
471
-
472
- # # Stream the text in chunks so the UI feels live
473
  # acc = ""
474
- # for i in range(0, len(text), 40):
475
- # acc += text[i:i+40]
476
- # yield acc
477
 
478
  # demo = gr.ChatInterface(
479
  # respond,
480
  # additional_inputs=[
481
- # gr.Textbox(
482
- # value=("You are a Chatbot who only answers spiritual questions based on three religious scriptures: (a) Hindu, e.g., the Bhagavad Gita, (b) Jewish, e.g., the Torah, (c) Christian, e.g., the Bible"
483
- # ". You will offer all three perspectives. You decline to answer questions that do not relate to spirituality."),
484
- # label="System message",
485
- # ),
486
  # gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
487
  # gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
488
  # gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
@@ -490,126 +609,7 @@
490
  # )
491
 
492
  # if __name__ == "__main__":
493
- # # share=True gives you a public link automatically
494
  # demo.launch(share=True)
495
 
496
- import os
497
- import gradio as gr
498
-
499
- # ---- llama.cpp backend (fast CPU) ----
500
- from llama_cpp import Llama
501
-
502
- # ---- to list files in a repo and pick a GGUF automatically ----
503
- from huggingface_hub import list_repo_files
504
-
505
- # ----------------- Config -----------------
506
- # You can override these via Space "Settings → Variables"
507
- # If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
508
- MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None
509
-
510
- # Known small GGUF chat repos (fast & lightweight). We'll try them in order.
511
- CANDIDATE_REPOS = [
512
- MODEL_REPO, # user-preferred first (may be None)
513
- "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
514
- "Qwen/Qwen2-0.5B-Instruct-GGUF",
515
- "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
516
- "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
517
- ]
518
-
519
- # Best-to-worst file name patterns to prefer when multiple GGUFs are present.
520
- PREFERRED_PATTERNS = [
521
- "q4_k_m.gguf", "Q4_K_M.gguf",
522
- "q4_0.gguf", "Q4_0.gguf",
523
- "q5_k_m.gguf", "Q5_K_M.gguf",
524
- ".gguf", # catch-all
525
- ]
526
-
527
- # Runtime knobs
528
- N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
529
- CTX = int(os.getenv("CTX", "2048"))
530
-
531
- SYSTEM_DEFAULT = (
532
- "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
533
- "and politely decline other questions."
534
- "and politely declines other questions."
535
-
536
- # --------------- GGUF Picker ---------------
537
- def pick_repo_and_file():
538
- """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
539
- tried = []
540
- for repo in [r for r in CANDIDATE_REPOS if r]: # drop None
541
- try:
542
- files = list_repo_files(repo)
543
- except Exception:
544
- tried.append(f"{repo} (list failed)")
545
- continue
546
- ggufs = [f for f in files if f.lower().endswith(".gguf")]
547
- if not ggufs:
548
- tried.append(f"{repo} (no .gguf)")
549
- continue
550
- # pick by pattern preference
551
- for pat in PREFERRED_PATTERNS:
552
- for f in ggufs:
553
- if pat in f:
554
- return repo, f
555
- tried_str = " | ".join(tried) if tried else "(none)"
556
- raise RuntimeError(
557
- "No GGUF file found in any candidate repo.\n"
558
- f"Tried: {tried_str}\n"
559
- "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
560
- "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
561
- )
562
-
563
- REPO_ID, FILENAME = pick_repo_and_file()
564
- print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")
565
-
566
- llm = Llama.from_pretrained(
567
- repo_id=REPO_ID,
568
- filename=FILENAME,
569
- n_ctx=CTX,
570
- n_threads=N_THREADS,
571
- n_gpu_layers=0, # CPU only
572
- logits_all=False,
573
- verbose=False,
574
- )
575
-
576
- def respond(message, history, system_message, max_tokens, temperature, top_p):
577
- sysmsg = system_message or SYSTEM_DEFAULT
578
- msgs = [{"role": "system", "content": sysmsg}]
579
- for u, a in (history or []):
580
- if u:
581
- msgs.append({"role": "user", "content": u})
582
- if a:
583
- msgs.append({"role": "assistant", "content": a})
584
- msgs.append({"role": "user", "content": message})
585
-
586
- stream = llm.create_chat_completion(
587
- messages=msgs,
588
- temperature=float(temperature),
589
- top_p=float(top_p),
590
- max_tokens=int(max_tokens),
591
- stream=True,
592
- )
593
- acc = ""
594
- for chunk in stream:
595
- delta = chunk["choices"][0]["delta"]
596
- tok = delta.get("content", "")
597
- if tok:
598
- acc += tok
599
- yield acc
600
-
601
- demo = gr.ChatInterface(
602
- respond,
603
- additional_inputs=[
604
- gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
605
- gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
606
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
607
- gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
608
- ],
609
- )
610
-
611
- if __name__ == "__main__":
612
- print(f"🧵 Threads: {N_THREADS}")
613
- demo.launch(share=True)
614
-
615
 
 
403
 
404
 
405
 
406
+ import os
407
+ import gradio as gr
408
+ import torch
409
+ from transformers import AutoTokenizer, AutoModelForCausalLM
410
+
411
+ # You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
412
+ MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
413
+
414
+ # Load once at startup
415
+ print(f"🔧 Loading local model: {MODEL_ID}")
416
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
417
+ model = AutoModelForCausalLM.from_pretrained(
418
+ MODEL_ID,
419
+ torch_dtype=torch.float32, # CPU-friendly
420
+ )
421
+ model.eval()
422
+
423
+ def build_prompt(system_message: str, history, user_msg: str) -> str:
424
+ """Try to use the model's chat template if present; otherwise use a generic prompt."""
425
+ messages = []
426
+ if system_message:
427
+ messages.append({"role": "system", "content": system_message})
428
+ for u, a in (history or []):
429
+ if u:
430
+ messages.append({"role": "user", "content": u})
431
+ if a:
432
+ messages.append({"role": "assistant", "content": a})
433
+ messages.append({"role": "user", "content": user_msg})
434
+
435
+ # Use chat template when available
436
+ try:
437
+ if getattr(tokenizer, "chat_template", None):
438
+ return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
439
+ except Exception:
440
+ pass
441
+
442
+ # Fallback generic formatting
443
+ parts = []
444
+ if system_message:
445
+ parts.append(f"System: {system_message}")
446
+ for u, a in (history or []):
447
+ if u:
448
+ parts.append(f"User: {u}")
449
+ if a:
450
+ parts.append(f"Assistant: {a}")
451
+ parts.append(f"User: {user_msg}")
452
+ parts.append("Assistant:")
453
+ return "\n".join(parts)
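To see exactly what the chat-template branch of build_prompt() above returns, the rendered prompt can be printed directly. A minimal sketch, assuming the same default TinyLlama checkpoint as MODEL_ID above and a toy message list (the names tok and msgs are illustrative, not part of the app):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", use_fast=True)
msgs = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
# TinyLlama ships a chat template, so this prints the fully formatted prompt,
# including the trailing assistant tag added by add_generation_prompt=True.
print(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))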
454
+
455
+ def respond(message, history, system_message, max_tokens, temperature, top_p):
456
+ prompt = build_prompt(system_message, history, message)
457
+ inputs = tokenizer(prompt, return_tensors="pt")
458
+ with torch.no_grad():
459
+ outputs = model.generate(
460
+ **inputs,
461
+ max_new_tokens=int(max_tokens),
462
+ do_sample=True,
463
+ temperature=float(temperature),
464
+ top_p=float(top_p),
465
+ pad_token_id=tokenizer.eos_token_id,
466
+ eos_token_id=tokenizer.eos_token_id,
467
+ )
468
+ # Decode only the newly generated portion
469
+ gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
470
+ text = tokenizer.decode(gen_ids, skip_special_tokens=True)
471
+
472
+ # Stream the text in chunks so the UI feels live
473
+ acc = ""
474
+ for i in range(0, len(text), 40):
475
+ acc += text[i:i+40]
476
+ yield acc
477
+
478
+ demo = gr.ChatInterface(
479
+ respond,
480
+ additional_inputs=[
481
+ gr.Textbox(
482
+ value=("You are a Chatbot who only answers spiritual questions based on three religious scriptures: (a) Hindu, e.g., the Bhagavad Gita, (b) Jewish, e.g., the Torah, (c) Christian, e.g., the Bible"
483
+ ". You will offer all three perspectives. You decline to answer questions that do not relate to spirituality."),
484
+ label="System message",
485
+ ),
486
+ gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
487
+ gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
488
+ gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
489
+ ],
490
+ )
491
+
492
+ if __name__ == "__main__":
493
+ # share=True gives you a public link automatically
494
+ demo.launch(share=True)
495
+
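Note: respond() above generates the complete reply first and then yields it in 40-character slices, so the streaming is simulated. If genuine token-by-token streaming is wanted, a minimal sketch using transformers' TextIteratorStreamer could look like the following; it reuses the tokenizer, model, and build_prompt() defined above, and the helper name respond_streaming is hypothetical, not part of this commit.

from threading import Thread
from transformers import TextIteratorStreamer

def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    # Build the prompt exactly as respond() does above.
    prompt = build_prompt(system_message, history, message)
    inputs = tokenizer(prompt, return_tensors="pt")
    # The streamer yields decoded text pieces as generate() emits tokens.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=int(max_tokens),
        do_sample=True,
        temperature=float(temperature),
        top_p=float(top_p),
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )
    # generate() blocks until it finishes, so run it in a background thread
    # and consume the streamer incrementally here.
    Thread(target=model.generate, kwargs=gen_kwargs).start()
    acc = ""
    for piece in streamer:
        acc += piece
        yield acc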
496
  # import os
497
  # import gradio as gr
498
+
499
+ # # ---- llama.cpp backend (fast CPU) ----
500
+ # from llama_cpp import Llama
501
+
502
+ # # ---- to list files in a repo and pick a GGUF automatically ----
503
+ # from huggingface_hub import list_repo_files
504
+
505
+ # # ----------------- Config -----------------
506
+ # # You can override these via Space "Settings → Variables"
507
+ # # If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
508
+ # MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None
509
+
510
+ # # Known small GGUF chat repos (fast & lightweight). We'll try them in order.
511
+ # CANDIDATE_REPOS = [
512
+ # MODEL_REPO, # user-preferred first (may be None)
513
+ # "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
514
+ # "Qwen/Qwen2-0.5B-Instruct-GGUF",
515
+ # "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
516
+ # "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
517
+ # ]
518
+
519
+ # # Best-to-worst file name patterns to prefer when multiple GGUFs are present.
520
+ # PREFERRED_PATTERNS = [
521
+ # "q4_k_m.gguf", "Q4_K_M.gguf",
522
+ # "q4_0.gguf", "Q4_0.gguf",
523
+ # "q5_k_m.gguf", "Q5_K_M.gguf",
524
+ # ".gguf", # catch-all
525
+ # ]
526
+
527
+ # # Runtime knobs
528
+ # N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
529
+ # CTX = int(os.getenv("CTX", "2048"))
530
+
531
+ # SYSTEM_DEFAULT = (
532
+ # "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
533
+ # "and politely declines other questions."
534
  # )
 
535
 
536
+ # # --------------- GGUF Picker ---------------
537
+ # def pick_repo_and_file():
538
+ # """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
539
+ # tried = []
540
+ # for repo in [r for r in CANDIDATE_REPOS if r]: # drop None
541
+ # try:
542
+ # files = list_repo_files(repo)
543
+ # except Exception:
544
+ # tried.append(f"{repo} (list failed)")
545
+ # continue
546
+ # ggufs = [f for f in files if f.lower().endswith(".gguf")]
547
+ # if not ggufs:
548
+ # tried.append(f"{repo} (no .gguf)")
549
+ # continue
550
+ # # pick by pattern preference
551
+ # for pat in PREFERRED_PATTERNS:
552
+ # for f in ggufs:
553
+ # if pat in f:
554
+ # return repo, f
555
+ # tried_str = " | ".join(tried) if tried else "(none)"
556
+ # raise RuntimeError(
557
+ # "No GGUF file found in any candidate repo.\n"
558
+ # f"Tried: {tried_str}\n"
559
+ # "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
560
+ # "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
561
+ # )
562
 
563
+ # REPO_ID, FILENAME = pick_repo_and_file()
564
+ # print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")
565
+
566
+ # llm = Llama.from_pretrained(
567
+ # repo_id=REPO_ID,
568
+ # filename=FILENAME,
569
+ # n_ctx=CTX,
570
+ # n_threads=N_THREADS,
571
+ # n_gpu_layers=0, # CPU only
572
+ # logits_all=False,
573
+ # verbose=False,
574
+ # )
575
 
576
+ # def respond(message, history, system_message, max_tokens, temperature, top_p):
577
+ # sysmsg = system_message or SYSTEM_DEFAULT
578
+ # msgs = [{"role": "system", "content": sysmsg}]
 
579
  # for u, a in (history or []):
580
  # if u:
581
+ # msgs.append({"role": "user", "content": u})
582
  # if a:
583
+ # msgs.append({"role": "assistant", "content": a})
584
+ # msgs.append({"role": "user", "content": message})
585
+
586
+ # stream = llm.create_chat_completion(
587
+ # messages=msgs,
588
+ # temperature=float(temperature),
589
+ # top_p=float(top_p),
590
+ # max_tokens=int(max_tokens),
591
+ # stream=True,
592
+ # )
593
  # acc = ""
594
+ # for chunk in stream:
595
+ # delta = chunk["choices"][0]["delta"]
596
+ # tok = delta.get("content", "")
597
+ # if tok:
598
+ # acc += tok
599
+ # yield acc
600
 
601
  # demo = gr.ChatInterface(
602
  # respond,
603
  # additional_inputs=[
604
+ # gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
605
  # gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
606
  # gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
607
  # gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
 
609
  # )
610
 
611
  # if __name__ == "__main__":
612
+ # print(f"🧵 Threads: {N_THREADS}")
613
  # demo.launch(share=True)
614
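For reference, the filename preference used by the (now commented-out) pick_repo_and_file() above can be exercised on its own with a small self-contained sketch; pick_gguf and the sample file list below are hypothetical, while PREFERRED_PATTERNS is copied from the block above. Earlier patterns win and ".gguf" is the catch-all.

# Order matters: the first pattern that matches any file decides the pick.
PREFERRED_PATTERNS = [
    "q4_k_m.gguf", "Q4_K_M.gguf",
    "q4_0.gguf", "Q4_0.gguf",
    "q5_k_m.gguf", "Q5_K_M.gguf",
    ".gguf",  # catch-all
]

def pick_gguf(files):
    # Keep only GGUF files, then return the first one matching the best pattern.
    ggufs = [f for f in files if f.lower().endswith(".gguf")]
    for pat in PREFERRED_PATTERNS:
        for f in ggufs:
            if pat in f:
                return f
    return None

files = ["README.md", "model-q8_0.gguf", "model-q5_k_m.gguf", "model-q4_k_m.gguf"]
print(pick_gguf(files))  # -> model-q4_k_m.gguf (q4_k_m outranks q5_k_m; q8_0 only matches the catch-all)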