Update README.md
README.md (CHANGED)
In the inference example, this commit moves `MAX_NUM_FRAMES=48` (previously set above the prompt template under the comment `## in VideoScore-v1.1, we support 48 frames in inference`) down next to `model_name`, keeps the earlier VideoScore configuration (`MAX_NUM_FRAMES=16`, `model_name="TIGER-Lab/VideoScore"`) as commented-out lines, separates the steps of the script with blank lines, and records the expected output of both checkpoints rather than only VideoScore-v1.1's.
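The first hunk starts inside the `_read_video_pyav` helper, so only its last two lines appear in the diff. For reference, here is a minimal sketch of the full helper together with the imports the example relies on. It follows the standard PyAV reader used in Hugging Face video examples and matches the tail shown in the diff, but the repo's actual code may differ; in particular, the import of `Idefics2ForSequenceClassification` from the Mantis package is an assumption about VideoScore's setup.

```python
import av
import numpy as np
import torch
from PIL import Image
from transformers import AutoProcessor
# assumption: in VideoScore's setup this class ships with the Mantis package,
# not with stock transformers
from mantis.models.idefics2 import Idefics2ForSequenceClassification

def _read_video_pyav(container, indices):
    # decode the stream once and keep only the frames whose index is in
    # `indices`, returning them as a single (num_frames, H, W, 3) uint8 array
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
```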
The updated example now reads as follows. A few context lines that the diff collapses are restored from the hunk headers (`if total_frames > MAX_NUM_FRAMES:`, `for x in [frames]:`, `num_aspects = logits.shape[-1]`) so the snippet runs end to end; the `flatten_images = []` initialization and the `<image>`-token padding after `num_image_token` are reconstructed from the surrounding logic and may differ slightly from the full README. The middle of `REGRESSION_QUERY_PROMPT` (the scoring rubric, including the sample line `factual consistency: 1.8`) stays elided, as in the diff.

```python
ROUND_DIGIT=3
REGRESSION_QUERY_PROMPT = """
Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
...
For this video, the text prompt is "{text_prompt}",
all the frames of video are as follows:
"""

# MAX_NUM_FRAMES=16
# model_name="TIGER-Lab/VideoScore"

# =======================================
# we support 48 frames in VideoScore-v1.1
# =======================================
MAX_NUM_FRAMES=48
model_name="TIGER-Lab/VideoScore-v1.1"

video_path="video1.mp4"
video_prompt="Near the Elephant Gate village, they approach the haunted house at night. Rajiv feels anxious, but Bhavesh encourages him. As they reach the house, a mysterious sound in the air adds to the suspense."

processor = AutoProcessor.from_pretrained(model_name,torch_dtype=torch.bfloat16)
model = Idefics2ForSequenceClassification.from_pretrained(model_name,torch_dtype=torch.bfloat16).eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# sample up to MAX_NUM_FRAMES frames uniformly from the video
container = av.open(video_path)
total_frames = container.streams.video[0].frames
if total_frames > MAX_NUM_FRAMES:
    indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
else:
    indices = np.arange(total_frames)

frames = [Image.fromarray(x) for x in _read_video_pyav(container, indices)]
eval_prompt = REGRESSION_QUERY_PROMPT.format(text_prompt=video_prompt)
num_image_token = eval_prompt.count("<image>")
if num_image_token < len(frames):
    # pad the prompt so there is one <image> token per sampled frame
    eval_prompt += "<image> " * (len(frames) - num_image_token)

flatten_images = []
for x in [frames]:
    if isinstance(x, list):
        flatten_images.extend(x)
    else:
        flatten_images.append(x)

flatten_images = [Image.open(x) if isinstance(x, str) else x for x in flatten_images]
inputs = processor(text=eval_prompt, images=flatten_images, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits
num_aspects = logits.shape[-1]
aspect_scores = []
for i in range(num_aspects):
    aspect_scores.append(round(logits[0, i].item(),ROUND_DIGIT))

print(aspect_scores)
"""
model output on visual quality, temporal consistency, dynamic degree,
text-to-video alignment, factual consistency, respectively
VideoScore:
[2.297, 2.469, 2.906, 2.766, 2.516]

VideoScore-v1.1:
[2.328, 2.484, 2.562, 1.969, 2.594]
"""
```
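A note on the sampling step: `np.arange` with the fractional stride `total_frames / MAX_NUM_FRAMES` yields exactly `MAX_NUM_FRAMES` roughly evenly spaced indices whenever the clip is longer than the cap. A quick standalone check with a hypothetical 120-frame clip:

```python
import numpy as np

MAX_NUM_FRAMES = 48
total_frames = 120  # hypothetical clip length

# same expression as in the README snippet above
indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
print(len(indices))  # 48
print(indices[:6])   # [ 0  2  5  7 10 12]
```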
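Because the model emits one regression score per aspect, pairing the printed list with the aspect names from the docstring makes the output self-describing. A small sketch; the `ASPECTS` list simply spells out the ordering stated in the comment above and is not part of the model's API:

```python
ASPECTS = [
    "visual quality",
    "temporal consistency",
    "dynamic degree",
    "text-to-video alignment",
    "factual consistency",
]

# e.g. the VideoScore-v1.1 output printed above for video1.mp4
aspect_scores = [2.328, 2.484, 2.562, 1.969, 2.594]
for name, score in zip(ASPECTS, aspect_scores):
    print(f"{name}: {score}")
```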