# -*- coding: utf-8 -*-
"""
Gradio 5 + ZeroGPU-ready chat.
- Streams tokens with TextIteratorStreamer
Refs:
- ZeroGPU docs (per-call GPU, 60s default, dynamic duration, effect-free on non-ZeroGPU):
  https://huggingface.co/docs/hub/en/spaces-zerogpu
- Using GPU Spaces:
  https://huggingface.co/docs/hub/en/spaces-gpus
- Gradio ChatInterface (type="messages"):
  https://www.gradio.app/docs/gradio/chatinterface
- Transformers chat templating:
  https://huggingface.co/docs/transformers/en/chat_templating
- TextIteratorStreamer:
  https://huggingface.co/docs/transformers/en/internal/generation_utils
- Meltemi Instruct v1.5 (Zephyr prompt format):
  https://huggingface.co/ilsp/Meltemi-7B-Instruct-v1.5
"""
import os, threading, gc, re
from typing import Iterable, List, Dict, Any
import gradio as gr
import spaces
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)
HAS_CUDA = bool(torch.cuda.is_available())
IS_ZEROGPU = bool(os.getenv("SPACES_ZERO_GPU"))
if IS_ZEROGPU:
    torch.compiler.set_stance("force_eager")
torch.set_float32_matmul_precision("high")
torch.backends.cuda.matmul.allow_tf32 = True
# ------------------------- Config --------------------------------------------
MODEL_ID = "ilsp/Meltemi-7B-v1.5" if HAS_CUDA else "HuggingFaceTB/SmolLM2-135M-Instruct"
SYSTEM_PROMPT = "You are a helpful, concise assistant. Reply in the user's language when possible."
ZEROGPU_DURATION_BASE = 20
ZEROGPU_DURATION_CAP = 60
CPU_MAX_NEW_TOKENS = 128
# Base defaults aim to respect ZeroGPU's 60s window.
MAX_NEW_TOKENS = 256
TEMPERATURE = 0.7
TOP_P = 0.95
TOP_K = 50
REPETITION_PENALTY = 1.05
# ------------------------- Load model/tokenizer ------------------------
# Use bf16 on CUDA, fall back to fp32 on CPU for compatibility.
dtype = torch.bfloat16 if HAS_CUDA else torch.float32
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=dtype)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
def clear_cache():
    if HAS_CUDA:
        try:
            torch.cuda.empty_cache()
        except Exception:
            pass
    gc.collect()
def build_messages(user_text: str, history: List[Dict[str, str]], system_message: str) -> List[Dict[str, str]]:
    """Merge system + history + user into HF messages format."""
    msgs: List[Dict[str, str]] = []
    if not history and system_message:
        msgs.append({"role": "system", "content": system_message})
    msgs.extend(history or [])
    msgs.append({"role": "user", "content": user_text})
    return msgs
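# Illustrative example (not executed): on the first turn,
#   build_messages("Hi", [], SYSTEM_PROMPT)
#   -> [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": "Hi"}]
# On later turns `history` is non-empty, so the system message is not prepended again.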
def to_model_inputs(messages: List[Dict[str, str]]) -> Dict[str, torch.Tensor]:
    """
    Create tokenized inputs.
    - If a chat template exists,
      use apply_chat_template(..., add_generation_prompt=True).
    - Otherwise fall back to a simple joined text prompt for base models.
    """
    if getattr(tokenizer, "chat_template", None):
        return tokenizer.apply_chat_template(
            messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
        )
    # Fallback: keep only the last user message, with no headings that would induce list-style output.
    print("Chat template not found.")
    last_user = next((m["content"] for m in reversed(messages) if m["role"] == "user"), "")
    prompt = (messages[0]["content"] + "\n\n" if messages and messages[0]["role"] == "system" else "") + last_user
    return tokenizer(prompt, return_tensors="pt")
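# Note: both paths return a dict-like with "input_ids" and "attention_mask" tensors of
# shape (1, seq_len), so the caller can treat them uniformly when moving to the device.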
CHATML_HEAD = re.compile(r'^\s*<\|im_start\|>\s*assistant\s*\n?', re.IGNORECASE)
CHATML_TAIL = re.compile(r'\s*<\|im_end\|>\s*$', re.IGNORECASE)
GEN_TAIL = re.compile(r'\s*(</s>|<\|end_of_text\|>)\s*$', re.IGNORECASE)
def _clean_stream_chunk(text: str) -> str:
    t = CHATML_HEAD.sub("", text)
    t = CHATML_TAIL.sub("", t)
    t = GEN_TAIL.sub("", t)
    return t
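# Illustrative: _clean_stream_chunk("<|im_start|>assistant\nΓεια σου!<|im_end|>") -> "Γεια σου!".
# This is a belt-and-braces cleanup for the rare case where special tokens survive decoding.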
def estimate_duration(max_new_tokens: int) -> int:
    """
    Heuristic duration budget for ZeroGPU (seconds).
    Shorter durations improve queue priority.
    """
    secs = int(float(ZEROGPU_DURATION_BASE) * float(max_new_tokens) / float(MAX_NEW_TOKENS))
    cap = int(ZEROGPU_DURATION_CAP)
    return min(secs, cap)
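# Worked example: with ZEROGPU_DURATION_BASE=20 and MAX_NEW_TOKENS=256,
#   estimate_duration(256) -> 20s, estimate_duration(512) -> 40s,
#   estimate_duration(2048) -> min(160, 60) -> 60s (capped).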
def _zerogpu_duration(message, history, system_message, max_new_tokens, temperature, top_k, top_p, repetition_penalty) -> int:
    """Dynamic ZeroGPU duration; must accept the same arguments as the GPU function."""
    return max(10, estimate_duration(int(max_new_tokens)))  # floor at 10s to avoid a zero budget
@spaces.GPU(duration=_zerogpu_duration)
def generate_on_accelerator(message: str, history: List[Dict[str, str]], system_message: str, max_new_tokens: int, temperature: float, top_k: int, top_p: float, repetition_penalty: float) -> Iterable[str]:
    """
    ChatInterface callback that works both on ZeroGPU and CPU-only Spaces.
    The @spaces.GPU decorator is effect-free outside ZeroGPU, so this still runs on CPU.
    """
    try:
        device = torch.device("cuda" if HAS_CUDA else "cpu")
        if model.device.type != device.type:
            model.to(device)
        messages = build_messages(message, history, system_message)
        inputs = to_model_inputs(messages)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        # Reduce generation length on CPU to keep latency reasonable.
        eff_max_new_tokens = int(max_new_tokens) if HAS_CUDA else min(int(max_new_tokens), CPU_MAX_NEW_TOKENS)
        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            skip_special_tokens=True,  # decode kwargs are passed through to tokenizer.decode()
        )
        gen_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=eff_max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=int(top_k),
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            cache_implementation="static",  # https://github.com/huggingface/transformers/issues/38501
        )
        # Run generation in a background thread and yield cumulative text as chunks arrive.
        thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
        thread.start()
        new_response = ""
        for text in streamer:
            new_response += text
            new_response = _clean_stream_chunk(new_response)
            yield new_response
    except Exception as e:
        print(e)
        raise gr.Error(str(e))
    finally:
        # Move the model back to CPU and free the CUDA cache when a GPU was used.
        if HAS_CUDA:
            model.to("cpu")
        clear_cache()
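# Quick smoke test outside Gradio (illustrative; argument order follows the signature above):
#   for chunk in generate_on_accelerator("Γεια σου!", [], SYSTEM_PROMPT, 32, 0.7, 50, 0.95, 1.05):
#       print(chunk)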
demo = gr.ChatInterface(
    fn=generate_on_accelerator,
    type="messages",  # OpenAI-style histories: [{"role": "...", "content": "..."}]
    title=f"{MODEL_ID} Chat",
    description=f"Chat with {MODEL_ID}",
    additional_inputs=[
        gr.Textbox(value=SYSTEM_PROMPT, label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=MAX_NEW_TOKENS, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=TEMPERATURE, step=0.1, label="Temperature"),
        gr.Slider(minimum=0, maximum=360, value=TOP_K, step=1, label="Top-k"),
        gr.Slider(minimum=0.1, maximum=1.0, value=TOP_P, step=0.05, label="Top-p"),
        gr.Slider(minimum=0.0, maximum=2.0, value=REPETITION_PENALTY, step=0.05, label="Repetition penalty"),
    ],
    examples=[
        ["Summarize the following paragraph in Greek."],
        ["Translate this into English: Καλημέρα, τι κάνεις;"],
        ["Write a short outline about Greek islands."],
    ],
    cache_examples=False,
)
demo.queue().launch()