import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Model loading with optimized settings
MODEL_NAME = "Qwen/Qwen3-0.6B"
cache_dir = "./model_cache"

# Load tokenizer with trust_remote_code for model-specific features
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    cache_dir=cache_dir
)

# Load model with GPU acceleration and memory optimization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=torch.float16,  # FP16 for reduced memory usage
    device_map="auto",          # Let accelerate handle device allocation
    cache_dir=cache_dir
).eval()  # Set to evaluation mode

# Create text generation pipeline (no explicit device needed with device_map)
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id  # Avoids pad-token warnings during generation
)


def generate_response(prompt, max_new_tokens=256, temperature=0.7, top_p=0.9):
    """Generate a response with safe defaults and error handling."""
    try:
        response = text_generator(
            prompt,
            max_new_tokens=int(max_new_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            do_sample=True,
            truncation=True  # Truncate over-long prompts to the model's context window
        )
        return response[0]["generated_text"]
    except Exception as e:
        return (
            f"⚠️ Model Error: {e}\n\n"
            "Try reducing input length or adjusting generation parameters."
        )


# Gradio interface with enhanced UI
with gr.Blocks(theme="soft", title="Qwen3-0.6B Chat Interface") as demo:
    gr.Markdown("# 🧠 Qwen3-0.6B Text-to-Text Chat")
    gr.Markdown("⚡ Optimized for Hugging Face Spaces with GPU acceleration")

    with gr.Row():
        with gr.Column(scale=2):
            prompt = gr.Textbox(
                label="User Input",
                placeholder="Ask me anything...",
                lines=5
            )
            with gr.Accordion("⚙️ Generation Parameters", open=False):
                max_new_tokens = gr.Slider(
                    minimum=32,
                    maximum=1024,  # Increased max for long-form generation
                    value=256,
                    step=32,
                    label="Max New Tokens"
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,  # Extended range for creative tasks
                    value=0.7,
                    step=0.1,
                    label="Temperature"
                )
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.1,
                    label="Top-p Sampling"
                )
        with gr.Column(scale=2):
            output = gr.Textbox(label="Model Response", lines=10, show_copy_button=True)

    submit = gr.Button("💬 Generate Response", variant="primary")
    submit.click(
        fn=generate_response,
        inputs=[prompt, max_new_tokens, temperature, top_p],
        outputs=output
    )

    gr.Examples(
        examples=[
            ["Explain quantum computing in simple terms"],
            ["Write a poem about autumn leaves"],
            ["Solve this math problem: 2x + 5 = 17"]
        ],
        inputs=prompt,
        label="🎯 Example Prompts"
    )

if __name__ == "__main__":
    demo.launch()
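
# Optional variant, shown as a commented sketch: Qwen3-0.6B is an
# instruction-tuned chat model, so chat-style prompts usually work better when
# wrapped in the tokenizer's chat template (the standard
# tokenizer.apply_chat_template API from transformers) instead of being passed
# as raw text. If preferred, define this before the Blocks section and wire it
# into submit.click in place of generate_response:
#
#   def generate_chat_response(user_prompt, max_new_tokens=256):
#       messages = [{"role": "user", "content": user_prompt}]
#       # add_generation_prompt appends the assistant header so the model replies
#       text = tokenizer.apply_chat_template(
#           messages, tokenize=False, add_generation_prompt=True
#       )
#       out = text_generator(text, max_new_tokens=int(max_new_tokens), do_sample=True)
#       return out[0]["generated_text"]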