"""Gradio demo: image captioning via an ONNX-exported vision2seq model.

NOTE(review): the original script loaded "WinKawaks/vit-small-patch16-224",
which is a plain ViT image *classifier* — that checkpoint ships no tokenizer
and has no text decoder, so both AutoTokenizer.from_pretrained and
ORTModelForVision2Seq.from_pretrained raise at load time. Switched to a real
vision-encoder-decoder checkpoint (ViT encoder + GPT-2 decoder) that matches
the script's intent: image in → generated caption out.
"""
import gradio as gr
from optimum.onnxruntime import ORTModelForVision2Seq
from transformers import AutoImageProcessor, AutoTokenizer

# A genuine vision2seq checkpoint: ViT encoder + GPT-2 decoder.
MODEL_ID = "nlpconnect/vit-gpt2-image-captioning"

# Load image processor, tokenizer, and the ONNX model.
# export=True converts the PyTorch weights to ONNX on the fly; the original's
# from_transformers=False was the deprecated alias of export and contradicted
# it, so it is dropped.
processor = AutoImageProcessor.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = ORTModelForVision2Seq.from_pretrained(MODEL_ID, export=True)


def run(image):
    """Generate a caption for *image* and return it as plain text.

    Args:
        image: a PIL.Image supplied by Gradio (type="pil"), or None when the
            user submits without uploading anything.

    Returns:
        The decoded caption string, or a fixed message when no image is given.
    """
    if image is None:
        return "No image provided."
    # Preprocess: resize/normalize into the pixel_values tensor the model expects.
    inputs = processor(images=image, return_tensors="pt")
    # Autoregressive generation, capped at 64 tokens.
    outputs = model.generate(**inputs, max_length=64)
    # Decode the first (only) sequence, stripping BOS/EOS/pad tokens.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# --- Gradio UI ---
demo = gr.Interface(
    fn=run,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="ViT-GPT2 Vision2Seq ONNX Demo",
    description="Upload an image → get a generated caption from "
                "nlpconnect/vit-gpt2-image-captioning (ONNX via optimum).",
)

# Guard the launch so importing this module doesn't start a web server.
if __name__ == "__main__":
    demo.launch()