Update README.md
README.md (CHANGED)
In the inference example, this commit moves `MAX_NUM_FRAMES=48` (previously set above the prompt template under the comment `## in VideoScore-v1.1, we support 48 frames in inference`) down next to `model_name`, keeps the earlier VideoScore configuration (`MAX_NUM_FRAMES=16`, `model_name="TIGER-Lab/VideoScore"`) as commented-out lines, separates the steps of the script with blank lines, and records the expected output of both checkpoints rather than only VideoScore-v1.1's.
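The first hunk starts inside the `_read_video_pyav` helper, so only its last two lines appear in the diff. For reference, here is a minimal sketch of the full helper together with the imports the example relies on. It follows the standard PyAV reader used in Hugging Face video examples and matches the tail shown in the diff, but the repo's actual code may differ; in particular, the import of `Idefics2ForSequenceClassification` from the Mantis package is an assumption about VideoScore's setup.

```python
import av
import numpy as np
import torch
from PIL import Image
from transformers import AutoProcessor
# assumption: in VideoScore's setup this class ships with the Mantis package,
# not with stock transformers
from mantis.models.idefics2 import Idefics2ForSequenceClassification

def _read_video_pyav(container, indices):
    # decode the stream once and keep only the frames whose index is in
    # `indices`, returning them as a single (num_frames, H, W, 3) uint8 array
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
```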
The updated example now reads as follows. A few context lines that the diff collapses are restored from the hunk headers (`if total_frames > MAX_NUM_FRAMES:`, `for x in [frames]:`, `num_aspects = logits.shape[-1]`) so the snippet runs end to end; the `flatten_images = []` initialization and the `<image>`-token padding after `num_image_token` are reconstructed from the surrounding logic and may differ slightly from the full README. The middle of `REGRESSION_QUERY_PROMPT` (the scoring rubric, including the sample line `factual consistency: 1.8`) stays elided, as in the diff.

```python
ROUND_DIGIT=3
REGRESSION_QUERY_PROMPT = """
Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
...
For this video, the text prompt is "{text_prompt}",
all the frames of video are as follows:
"""

# MAX_NUM_FRAMES=16
# model_name="TIGER-Lab/VideoScore"

# =======================================
# we support 48 frames in VideoScore-v1.1
# =======================================
MAX_NUM_FRAMES=48
model_name="TIGER-Lab/VideoScore-v1.1"

video_path="video1.mp4"
video_prompt="Near the Elephant Gate village, they approach the haunted house at night. Rajiv feels anxious, but Bhavesh encourages him. As they reach the house, a mysterious sound in the air adds to the suspense."

processor = AutoProcessor.from_pretrained(model_name,torch_dtype=torch.bfloat16)
model = Idefics2ForSequenceClassification.from_pretrained(model_name,torch_dtype=torch.bfloat16).eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# sample up to MAX_NUM_FRAMES frames uniformly from the video
container = av.open(video_path)
total_frames = container.streams.video[0].frames
if total_frames > MAX_NUM_FRAMES:
    indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
else:
    indices = np.arange(total_frames)

frames = [Image.fromarray(x) for x in _read_video_pyav(container, indices)]
eval_prompt = REGRESSION_QUERY_PROMPT.format(text_prompt=video_prompt)
num_image_token = eval_prompt.count("<image>")
if num_image_token < len(frames):
    # pad the prompt so there is one <image> token per sampled frame
    eval_prompt += "<image> " * (len(frames) - num_image_token)

flatten_images = []
for x in [frames]:
    if isinstance(x, list):
        flatten_images.extend(x)
    else:
        flatten_images.append(x)

flatten_images = [Image.open(x) if isinstance(x, str) else x for x in flatten_images]
inputs = processor(text=eval_prompt, images=flatten_images, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits
num_aspects = logits.shape[-1]
aspect_scores = []
for i in range(num_aspects):
    aspect_scores.append(round(logits[0, i].item(),ROUND_DIGIT))

print(aspect_scores)
"""
model output on visual quality, temporal consistency, dynamic degree,
text-to-video alignment, factual consistency, respectively
VideoScore:
[2.297, 2.469, 2.906, 2.766, 2.516]

VideoScore-v1.1:
[2.328, 2.484, 2.562, 1.969, 2.594]
"""
```
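A note on the sampling step: `np.arange` with the fractional stride `total_frames / MAX_NUM_FRAMES` yields exactly `MAX_NUM_FRAMES` roughly evenly spaced indices whenever the clip is longer than the cap. A quick standalone check with a hypothetical 120-frame clip:

```python
import numpy as np

MAX_NUM_FRAMES = 48
total_frames = 120  # hypothetical clip length

# same expression as in the README snippet above
indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
print(len(indices))  # 48
print(indices[:6])   # [ 0  2  5  7 10 12]
```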
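Because the model emits one regression score per aspect, pairing the printed list with the aspect names from the docstring makes the output self-describing. A small sketch; the `ASPECTS` list simply spells out the ordering stated in the comment above and is not part of the model's API:

```python
ASPECTS = [
    "visual quality",
    "temporal consistency",
    "dynamic degree",
    "text-to-video alignment",
    "factual consistency",
]

# e.g. the VideoScore-v1.1 output printed above for video1.mp4
aspect_scores = [2.328, 2.484, 2.562, 1.969, 2.594]
for name, score in zip(ASPECTS, aspect_scores):
    print(f"{name}: {score}")
```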