import random

import decord
import numpy as np
import torch
from PIL import Image
from torchvision.transforms import Compose, InterpolationMode, Normalize, Resize, ToTensor


def _convert_to_rgb(image):
    return image.convert('RGB')


def image_transform(image_size: int):
    # CLIP-style normalization statistics.
    mean = (0.48145466, 0.4578275, 0.40821073)
    std = (0.26862954, 0.26130258, 0.27577711)
    normalize = Normalize(mean=mean, std=std)
    transforms = [
        Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        _convert_to_rgb,
        ToTensor(),
        normalize,
    ]
    return Compose(transforms)
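
# For reference: image_transform(448) maps a PIL image of any size and mode
# to a normalized float tensor of shape (3, 448, 448).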


def preprocess_multimodal(sources, num_segments):
    """Expand each '<video>' placeholder into per-frame special tokens.

    For `num_segments` sampled frames, `num_segments // 2` '<image>' tokens
    are emitted: all but the last are followed by '<eof>', the last by
    '<eov>', and the whole span is wrapped in '<vi_start>' ... '<vi_end>'.
    """
    for source in sources:
        for sentence in source:
            X_token = '<video>'
            if X_token in sentence['content']:
                ns = num_segments // 2 - 1
                replace_token = '<image><eof>' * ns + '<image><eov>'
                replace_token = '<vi_start>' + replace_token + '<vi_end>'
                sentence['content'] = sentence['content'].replace(X_token, replace_token)
    return sources
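
# Worked example: with num_segments = 8 the placeholder becomes
# '<vi_start>' + '<image><eof>' * 3 + '<image><eov>' + '<vi_end>',
# i.e. 8 // 2 = 4 '<image>' tokens in total.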


def preprocess(
    sources,
    tokenizer,
    s_id=None,
):
    """Prepend a bilingual system prompt and tokenize with the chat template.

    `s_id` pins the QA template to a fixed index; when it is None a template
    is drawn at random.
    """
    en_qa_templates = [
        "Review the given video and answer the question associated with its visual elements.",
        "Watch the provided video and offer an accurate response to the related question.",
        "Scrutinize the video carefully, identifying relevant details in order to address the linked question.",
        "Take a close look at the presented visuals and deliver a precise answer to the corresponding question.",
        "Observe the video attentively and accurately respond to the associated question.",
        "View the video attentively and provide a suitable answer to the posed question.",
        "Examine the video and approach the connected question with an informed response.",
        "Assess the displayed video and answer the subsequent question with accuracy.",
        "Consider the video content and deliver a relevant answer to the corresponding question.",
        "Go through the video, taking into account key aspects, and respond to the question.",
    ]
    # Chinese counterparts of the English templates above; the system prompt
    # deliberately mixes both languages.
    ch_qa_templates = [
        "审阅所提供的视频,并回答与其视觉元素相关的问题。",
        "观看所提供的视频,对相关问题给出准确的回答。",
        "仔细审查视频,识别相关的细节,回答与之相关的问题。",
        "仔细观察所展示的视觉内容,并对相应的问题给出精确的回答。",
        "认真观察视频并准确回答相关的问题。",
        "详细观看视频,并且对提出的问题给出合适的回答。",
        "观察视频并用有依据的回答来解答相关的问题。",
        "评估展示的视频,并准确地回答随后的问题。",
        "根据视频内容,对相应的问题给出合理的答案。",
        "浏览视频,根据其中的关键内容回答问题。",
    ]
    if s_id is not None:
        index = s_id
    else:
        index = random.randrange(len(en_qa_templates))
    system_prompt = f"""You are a helpful assistant, {en_qa_templates[index]} 你是一个乐于助人的助手,{ch_qa_templates[index]}"""
    messages = []
    for source in sources:
        message = [{'role': 'system', 'content': system_prompt}]
        for sentence in source:
            message.append(sentence)
        messages.append(message)
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors='pt')
    return input_ids
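
# For a single source, the list handed to apply_chat_template looks like:
# [{'role': 'system', 'content': system_prompt},
#  {'role': 'user', 'content': '<vi_start><image>...<vi_end>\n<question>'},
#  ...]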


def get_index(fps, max_frame, num_segments):
    """Pick `num_segments` frame indices and their timestamps in seconds.

    When the clip has no more frames than requested, indices wrap around
    modulo the frame count so the output length is still `num_segments`;
    otherwise frames are sampled uniformly across the clip.
    """
    num_frames = max_frame
    if num_frames <= num_segments:
        out_indices = np.array([idx % num_frames for idx in range(num_segments)])
        out_indices = np.sort(out_indices)
    else:
        out_indices = np.linspace(0, num_frames - 1, num_segments)
    durations = [idx.item() / fps for idx in out_indices]
    return out_indices.astype(np.int64), durations
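
# Worked example: fps=30.0, max_frame=10, num_segments=4 takes the linspace
# branch and yields indices [0, 3, 6, 9] with durations [0.0, 0.1, 0.2, 0.3].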


def read_video(video_path, num_segments):
    """Decode `num_segments` frames from the video and stack them as a tensor."""
    image_processor = image_transform(image_size=448)
    vr = decord.VideoReader(video_path)
    fps = float(vr.get_avg_fps())
    frame_indices, durations = get_index(fps, len(vr) - 1, num_segments)
    video = []
    for frame_index in frame_indices:
        image = Image.fromarray(vr[frame_index].asnumpy())
        video.append(image_processor(image).unsqueeze(0))
    video = torch.cat(video)
    return video, torch.tensor(durations)
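
# For reference (the file name is illustrative): read_video('clip.mp4', 8)
# returns a frame tensor of shape (8, 3, 448, 448) plus the 8 timestamps.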


def get_input(video_path, num_segments, question, history, tokenizer, s_id):
    """Assemble (video, durations, input_ids, conversations) for a user turn."""
    video, durations = read_video(video_path, num_segments)
    if history is None:
        # First turn: prefix the question with the '<video>' placeholder.
        conversations = [{'role': 'user', 'content': f'<video>\n{question}'}]
    else:
        conversations = history
        conversations.append({'role': 'user', 'content': question})
    sources = [conversations]
    sources = preprocess_multimodal(sources, video.shape[0])
    input_ids = preprocess(sources, tokenizer, s_id=s_id)
    return video, durations, input_ids, conversations


def add_pred_to_history(history, pred):
    history.append({'role': 'assistant', 'content': pred})
    return history
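

# Minimal usage sketch. The checkpoint path, the `model` object, and its
# `generate` signature below are assumptions for illustration, not part of
# this module; adapt them to the actual model these helpers ship with.
if __name__ == '__main__':
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('path/to/checkpoint', trust_remote_code=True)  # hypothetical path
    history = None
    video, durations, input_ids, history = get_input(
        'clip.mp4',                                # hypothetical video file
        num_segments=8,
        question='What happens in this video?',
        history=history,
        tokenizer=tokenizer,
        s_id=0,
    )
    # A model loaded alongside the tokenizer would consume the frame tensor
    # together with the token ids, e.g. (sketch, not a real signature):
    # pred = model.generate(input_ids=input_ids, video=video, durations=durations)
    # history = add_pred_to_history(history, tokenizer.decode(pred[0]))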