"""Gradio demo: image captioning via an ONNX-exported vision2seq model.

NOTE(review): the original script loaded "WinKawaks/vit-small-patch16-224",
which is a plain ViT image *classifier* — that checkpoint ships no tokenizer
and has no text decoder, so both AutoTokenizer.from_pretrained and
ORTModelForVision2Seq.from_pretrained raise at load time. Switched to a real
vision-encoder-decoder checkpoint (ViT encoder + GPT-2 decoder) that matches
the script's intent: image in → generated caption out.
"""
import gradio as gr
from optimum.onnxruntime import ORTModelForVision2Seq
from transformers import AutoImageProcessor, AutoTokenizer

# A genuine vision2seq checkpoint: ViT encoder + GPT-2 decoder.
MODEL_ID = "nlpconnect/vit-gpt2-image-captioning"

# Load image processor, tokenizer, and the ONNX model.
# export=True converts the PyTorch weights to ONNX on the fly; the original's
# from_transformers=False was the deprecated alias of export and contradicted
# it, so it is dropped.
processor = AutoImageProcessor.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = ORTModelForVision2Seq.from_pretrained(MODEL_ID, export=True)


def run(image):
    """Generate a caption for *image* and return it as plain text.

    Args:
        image: a PIL.Image supplied by Gradio (type="pil"), or None when the
            user submits without uploading anything.

    Returns:
        The decoded caption string, or a fixed message when no image is given.
    """
    if image is None:
        return "No image provided."
    # Preprocess: resize/normalize into the pixel_values tensor the model expects.
    inputs = processor(images=image, return_tensors="pt")
    # Autoregressive generation, capped at 64 tokens.
    outputs = model.generate(**inputs, max_length=64)
    # Decode the first (only) sequence, stripping BOS/EOS/pad tokens.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# --- Gradio UI ---
demo = gr.Interface(
    fn=run,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="ViT-GPT2 Vision2Seq ONNX Demo",
    description="Upload an image → get a generated caption from "
                "nlpconnect/vit-gpt2-image-captioning (ONNX via optimum).",
)

# Guard the launch so importing this module doesn't start a web server.
if __name__ == "__main__":
    demo.launch()