# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import math
import os
import os.path as osp
from copy import deepcopy
from threading import Thread
from typing import List, Optional

import torch
import torchvision
from PIL import Image
from transformers import (
    AutoProcessor,
    PretrainedConfig,
    PreTrainedModel,
    Qwen2Config,
    Qwen2ForCausalLM,
    Qwen2PreTrainedModel,
    TextIteratorStreamer,
)


class VILAConfig(PretrainedConfig):
    """Configuration for VILA multimodal models.

    Holds the sub-module configs (LLM, vision/speech/sound towers, and their
    multimodal projectors) plus preprocessing options such as image tiling,
    video frame sampling, and time tokens. The ``*_encoder`` fields are JSON
    strings whose ``"_target_"`` key names the encoder class to instantiate.
    """

    model_type = "vila"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        llm_cfg=None,
        vision_tower_cfg=None,
        mm_projector_cfg=None,
        speech_tower_cfg=None,
        sound_tower_cfg=None,
        speech_mm_projector_cfg=None,
        sound_mm_projector_cfg=None,
        architectures=None,
        resume_path=None,
        hidden_size=None,
        mm_hidden_size=None,
        image_aspect_ratio=None,
        num_video_frames=None,
        fps=None,
        mm_vision_select_layer=None,
        mm_vision_select_feature=None,
        mm_use_im_start_end=False,
        mm_use_im_patch_token=False,
        mm_projector_lr=None,
        vision_tower_lr=None,
        vision_resolution=None,
        interpolate_mode=None,
        s2=None,
        dynamic_s2=None,
        s2_scales=None,
        s2_max_split_size=None,
        s2_resize_output_to_scale_idx=0,
        min_tiles: Optional[int] = 1,
        max_tiles: Optional[int] = 12,
        num_time_tokens=None,
        time_token_format=None,
        image_encoder: str = '{"_target_": "llava.model.encoders.BasicImageEncoder"}',
        video_encoder: str = '{"_target_": "llava.model.encoders.TSPVideoEncoder"}',
        sound_encoder: str = '{"_target_": "llava.model.encoders.BasicSoundEncoder"}',
        speech_encoder: str = '{"_target_": "llava.model.encoders.BasicSpeechEncoder"}',
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.architectures = architectures
        self.llm_cfg = llm_cfg
        self.vision_tower_cfg = vision_tower_cfg
        self.mm_projector_cfg = mm_projector_cfg
        self.speech_tower_cfg = speech_tower_cfg
        self.sound_tower_cfg = sound_tower_cfg
        self.speech_mm_projector_cfg = speech_mm_projector_cfg
        self.sound_mm_projector_cfg = sound_mm_projector_cfg
        self.resume_path = resume_path
        self.hidden_size = hidden_size
        self.mm_hidden_size = mm_hidden_size
        self.image_aspect_ratio = image_aspect_ratio
        self.num_video_frames = num_video_frames
        self.fps = fps
        self.mm_vision_select_layer = mm_vision_select_layer
        self.mm_vision_select_feature = mm_vision_select_feature
        self.mm_use_im_start_end = mm_use_im_start_end
        self.mm_use_im_patch_token = mm_use_im_patch_token
        self.mm_projector_lr = mm_projector_lr
        self.vision_tower_lr = vision_tower_lr
        self.vision_resolution = vision_resolution
        self.interpolate_mode = interpolate_mode
        self.s2 = s2
        self.dynamic_s2 = dynamic_s2
        self.s2_scales = s2_scales
        self.s2_max_split_size = s2_max_split_size
        self.s2_resize_output_to_scale_idx = s2_resize_output_to_scale_idx
        self.min_tiles = min_tiles
        self.max_tiles = max_tiles
        self.num_time_tokens = num_time_tokens
        self.time_token_format = time_token_format
        self.image_encoder = image_encoder
        self.video_encoder = video_encoder
        self.sound_encoder = sound_encoder
        self.speech_encoder = speech_encoder

        # Fixed audio and audio/video interleaving defaults; these are
        # hard-coded here rather than exposed as constructor arguments.
        self.audio_sampling_rate = 16000
        self.audio_chunk_length = 120
        self.interleaved_vis_aud_in_video = True
        self.interleaved_video_segment_duration = 30
        self.audio_hop_length = 60
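
# A minimal construction sketch for reference (the values below are
# illustrative assumptions, not defaults shipped with any checkpoint; real
# checkpoints are loaded via VILAConfig.from_pretrained, which reads their
# config.json):
#
#     config = VILAConfig(
#         architectures=["VILAForCausalLM"],  # hypothetical architecture name
#         hidden_size=3584,
#         num_video_frames=8,
#         fps=2.0,
#     )
#     config.save_pretrained("./my-vila-config")  # standard PretrainedConfig API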