import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import gradio as gr
from PIL import Image
from huggingface_hub import login
import os
import warnings

# Suppress warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# ========== Log in using your secret named fmv ==========
token = os.getenv("fmv")
if token:
    login(token=token)
    print("Successfully logged in with token!")
else:
    print("Warning: Token not found")
# ==========================================

# Hugging Face model repository path
model_path = "hiko1999/Qwen2-Wildfire-2B"

# Load model and processor
print(f"Loading model: {model_path}")
# The processor below already bundles a tokenizer; loading one separately is optional.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="cpu"  # CPU-only inference; bfloat16 keeps memory usage down but is slow on CPU
)
processor = AutoProcessor.from_pretrained(model_path)
print("Model loaded successfully!")


# Define prediction function
def predict(image):
    """Process image and generate description"""
    if image is None:
        return "Error: No image uploaded"
    try:
        # Build message with English prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Describe this wildfire scene in English. Include details about the fire intensity, affected area, and visible environmental conditions."}
                ]
            }
        ]

        # Process input
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        )
        # Ensure running on CPU
        inputs = inputs.to("cpu")

        # Generate output
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7
        )

        # Decode output, trimming the prompt tokens from each generated sequence
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )
        return output_text[0]
    except Exception as e:
        return f"Prediction failed: {str(e)}"


# Gradio interface function
def gradio_interface(image):
    """Main function for Gradio interface"""
    result = predict(image)
    return result


# Create Gradio interface (all in English)
interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil", label="Upload Wildfire Image"),
    outputs=gr.Textbox(label="AI Analysis Result", lines=10),
    title="🔥 Wildfire Scene Analysis System",
    description="Upload a wildfire-related image and AI will automatically analyze and describe the fire situation in English."
)

# Launch interface
if __name__ == "__main__":
    interface.launch(share=False)
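
# --- Optional headless smoke test (a minimal sketch, commented out) ---
# Assumes a local image named "sample_fire.jpg"; that filename is hypothetical
# and not shipped with this repo. Uncomment to exercise predict() directly
# without launching the Gradio UI:
#
# print(predict(Image.open("sample_fire.jpg")))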