Update README.md
README.md
CHANGED

@@ -54,6 +54,7 @@ Granite Vision model is supported natively in `transformers` from the `main` branch

```python
from transformers import AutoProcessor, AutoModelForVision2Seq
+from huggingface_hub import hf_hub_download
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
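
The few README lines between this hunk and the next are untouched by the diff and not shown. From the next hunk's context line (`processor = AutoProcessor.from_pretrained(model_path)`) and the checkpoint id used in the vLLM section below, they presumably amount to something like this sketch:

```python
# Checkpoint id taken from the vLLM snippet further down; processor line from the next hunk's context.
model_path = "ibm-granite/granite-vision-3.1-2b-preview"
processor = AutoProcessor.from_pretrained(model_path)
```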

@@ -63,14 +64,19 @@ processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForVision2Seq.from_pretrained(model_path).to(device)

# prepare image and text prompt, using the appropriate prompt template
-
+
+img_path = hf_hub_download(repo_id=model_path, filename='example.png')

conversation = [
+    {
+        "role": "system",
+        "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
+    },
    {
        "role": "user",
        "content": [
-            {"type": "image", "url":
-            {"type": "text", "text": "What is
+            {"type": "image", "url": img_path},
+            {"type": "text", "text": "What is the highest scoring model on ChartQA and what is its score?"},
        ],
    },
]
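
The transformers hunks end at the conversation definition, before the generation call. Assuming the snippet goes on to use the processor's chat template in the usual way (that part of the README is outside this diff, and the `max_new_tokens` value below is arbitrary), the updated example would run roughly like this:

```python
# Apply the chat template to the multimodal conversation, then generate and decode.
inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(device)

output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
```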

@@ -101,6 +107,8 @@ Then, copy the snippet from the section that is relevant for your use case.
```python
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
+from huggingface_hub import hf_hub_download
+from PIL import Image

model_path = "ibm-granite/granite-vision-3.1-2b-preview"

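The lines between this hunk and the next are untouched by the diff and not shown; the next hunk's context line `sampling_params = SamplingParams(` implies they construct the vLLM engine and the sampling settings, roughly along these lines (a sketch with illustrative values, not the README's actual code):

```python
# Sketch only: engine and sampling setup implied by the surrounding context.
llm = LLM(model=model_path)
sampling_params = SamplingParams(temperature=0.2, max_tokens=64)
```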

@@ -118,9 +126,10 @@ sampling_params = SamplingParams(
image_token = "<image>"
system_prompt = "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n"

-question = "What
+question = "What is the highest scoring model on ChartQA and what is its score?"
prompt = f"{system_prompt}<|user|>\n{image_token}\n{question}\n<|assistant|>\n"
-
+img_path = hf_hub_download(repo_id=model_path, filename='example.png')
+image = Image.open(img_path).convert("RGB")
print(image)

# Build the inputs to vLLM; the image is passed as `multi_modal_data`.
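
The final hunk stops at the comment about building the vLLM inputs. Assuming the snippet finishes with vLLM's standard multimodal call, where the prompt string and the PIL image are passed together and `llm`/`sampling_params` come from the part of the README not shown here, the remainder would look roughly like:

```python
# Pass the prompt and the PIL image together; the image rides in `multi_modal_data`.
inputs = {
    "prompt": prompt,
    "multi_modal_data": {"image": image},
}
outputs = llm.generate(inputs, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```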