LanguageBind
/

Video-LLaVA-7B-hf

@@ -70,7 +70,22 @@ def read_video_pyav(container, indices):
 model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
 processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
-prompt = "USER: <video>Why is this video funny? ASSISTANT:"
 video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
 container = av.open(video_path)
@@ -89,11 +104,22 @@ print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_to
 # Generate from images and videos mix
 url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
-prompt = [
-    "USER: <image> How many cats are there in the image? ASSISTANT:",
-    "USER: <video>Why is this video funny? ASSISTANT:"
 ]
-inputs = processor(text=prompt, images=image, videos=clip, padding=True, return_tensors="pt")
 # Generate
 generate_ids = model.generate(**inputs, max_length=50)

 model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
 processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
+# Define a chat history and use `apply_chat_template` to get a correctly formatted prompt
+# Each value in "content" has to be a list of dicts with types ("text", "image", "video")
+conversation = [
+    {
+      "role": "user",
+      "content": [
+          {"type": "text", "text": "Why is this video funny?"},
+          {"type": "video"},
+        ],
+    },
+]
+# will be formatted as "USER: <video>\nWhy is this video funny? ASSISTANT:"
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
 container = av.open(video_path)
 # Generate from images and videos mix
 url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
+# Define a chat history and use `apply_chat_template` to get a correctly formatted prompt
+# Each value in "content" has to be a list of dicts with types ("text", "image", "video")
+conversation_image = [
+    {
+      "role": "user",
+      "content": [
+          {"type": "text", "text": "How many cats are there in the image?"},
+          {"type": "image"},
+        ],
+    },
 ]
+prompt_image = processor.apply_chat_template(conversation_image, add_generation_prompt=True)
+inputs = processor(text=[prompt_image, prompt], images=image, videos=clip, padding=True, return_tensors="pt")
 # Generate
 generate_ids = model.generate(**inputs, max_length=50)