AtAndDev committed on
Commit 50e657e · verified · 1 Parent(s): af0d841

Upload ultravox_model.py with huggingface_hub

Files changed (1):
  1. ultravox_model.py +92 -145

ultravox_model.py CHANGED
@@ -33,13 +33,10 @@ SHARED_PRETRAINED_KWARGS = [
 class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
     """
     The Ultravox model which consists of an audio encoder and a language model.
-
     Audio input is processed by the audio encoder, then every `stack_factor` frames are stacked together and
     projected to the language model's embedding space using a few linear layers.
     The text is embedded by the language model as usual and then the audio and text embeddings are merged together.
-
     A special token `<|audio|>` is used to indicate the start of the audio embeddings in the merged embeddings.
-
     Parameters:
         config: Model configuration class with all the parameters of the model.
     """
@@ -59,11 +56,11 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         self.keep_params: Set[str] = set()
         self.vocab_size = config.vocab_size
 
-        if not config.llm_only_training:
-            self.audio_tower = self._create_audio_tower(config)
-            self.multi_modal_projector = self._create_multi_modal_projector(config)
-            self.audio_tower_context_length = self.audio_tower.max_context_length
+        self.audio_tower = self._create_audio_tower(config)
+        self.audio_tower_context_length: Optional[int] = None
+        self.audio_tower_context_length = self.audio_tower.max_context_length
 
+        self.multi_modal_projector = self._create_multi_modal_projector(config)
         self.language_model = self._create_language_model(config)
 
         if self.language_model._tied_weights_keys is not None:
@@ -72,16 +69,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
             ]
 
         # Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
+        # FSDP throws an error if some of the layer types are not found in the model.
         # This would be something like ["LlamaDecoderLayer"] as we don't split audio encoder layers.
-        # FSDP throws an error if some of the layer types are not found in the model, and they need to be filted out.
-        # 1. Get the names the language model *wants* to keep intact
-        candidate_names = set(
-            getattr(self.language_model, "_no_split_modules", []) or []
-        )
-        # 2. Names that actually exist in the current model
-        present_names = {m.__class__.__name__ for m in self.modules()}
-        # 3. Keep only those that are both requested and present
-        self._no_split_modules = list(candidate_names & present_names)
+        self._no_split_modules = self.language_model._no_split_modules
 
         self.loss_config = LossConfig()
         self.post_init()
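For context on the `_no_split_modules` comment above: an FSDP auto-wrap policy consumes such a list of layer-class names, and the removed code filtered that list against classes actually present in the model. The helper below is a hedged sketch of that idea, not code from the repository; `transformer_auto_wrap_policy` is the standard PyTorch FSDP helper.

    import functools
    from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy

    def resolve_no_split_classes(model, no_split_names):
        # Map class names such as "LlamaDecoderLayer" to classes actually present in the
        # module tree; per the diff's comment, FSDP errors out on names that are absent.
        present = {m.__class__.__name__: m.__class__ for m in model.modules()}
        return {present[name] for name in (no_split_names or []) if name in present}

    # auto_wrap_policy = functools.partial(
    #     transformer_auto_wrap_policy,
    #     transformer_layer_cls=resolve_no_split_classes(model, model._no_split_modules),
    # )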
@@ -159,17 +149,13 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         self, labels: Optional[torch.Tensor]
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Get boolean masks for positions where we want to compute KL divergence.
-
         For each label position, we want the position before it since that's where
         the model makes the prediction for that label.
-
         Additionally, we want to identify the position right before the EOT token
         (the last token with label != -100).
-
         Args:
             labels: Tensor of shape (B, T) where B is batch size and T is sequence length,
                 with -100 for masked positions and token ids for label positions
-
         Returns:
             Tuple containing:
             - pred_mask: Boolean tensor of shape (B, T) that's True for positions where we want to compute KL divergence
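A minimal sketch with toy tensors, not the repository implementation, of the mask logic this docstring describes: a position is a prediction position when the next token carries a label, and the EOT prediction position is the one just before the last labeled token.

    import torch

    labels = torch.tensor([[-100, -100, 51, 52, 53, -100]])   # (B, T)
    label_mask = labels != -100

    pred_mask = torch.zeros_like(label_mask)
    pred_mask[:, :-1] = label_mask[:, 1:]        # position i predicts the label at i + 1

    t_idx = torch.arange(labels.shape[1])
    last_label_idx = (label_mask * t_idx).argmax(dim=-1)   # index of the last labeled token
    eot_mask = torch.zeros_like(label_mask)
    eot_mask[torch.arange(labels.shape[0]), last_label_idx - 1] = True   # position just before it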
@@ -239,32 +225,27 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
             )
 
         # Compute the KL divergence loss for EOT token positions if any exist
-        if self.loss_config.eot_loss_weight > 0:
-            eot_loss = F.kl_div(
-                F.log_softmax(
-                    lm_output.logits[eot_mask] / self.loss_config.kl_temperature,
-                    dim=-1,
-                ),
-                F.softmax(
-                    alt_lm_output.logits[alt_eot_mask]
-                    / self.loss_config.kl_temperature,
-                    dim=-1,
-                ),
-                reduction="batchmean",
-            )
-            kl_loss += self.loss_config.eot_loss_weight * eot_loss
+        eot_loss = F.kl_div(
+            F.log_softmax(
+                lm_output.logits[eot_mask] / self.loss_config.kl_temperature,
+                dim=-1,
+            ),
+            F.softmax(
+                alt_lm_output.logits[alt_eot_mask] / self.loss_config.kl_temperature,
+                dim=-1,
+            ),
+            reduction="batchmean",
+        )
 
-        return kl_loss
+        return {"loss": kl_loss + self.loss_config.eot_loss_weight * eot_loss}
 
     def _audio_iter(
         self, audio_batch_size: torch.Tensor
     ) -> Generator[Tuple[int, int], None, None]:
         """
         Iterate over the audio batch size and yield the batch index and audio index of each audio item.
-
         Args:
             audio_batch_size: A tensor of shape (B,) where B is the batch size.
-
         Returns:
             A generator that yields a tuple of (start index, length) for each audio item.
         """
@@ -277,8 +258,8 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
     def forward(
         self,
         input_ids: torch.Tensor,
-        audio_values: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        audio_values: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         audio_token_start_idx: Optional[torch.Tensor] = None,
@@ -291,16 +272,14 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         alt_attention_mask: Optional[torch.Tensor] = None,
         alt_labels: Optional[torch.Tensor] = None,
         **kwargs,
-    ) -> transformers.modeling_outputs.CausalLMOutputWithPast:
+    ) -> Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]:
         """
         Forward pass for the Ultravox model.
-
         `input_ids` are the tokenized text input. They are embedded by the language model as usual.
         `audio_values` are processed by the audio encoder and then every `stack_factor` frames are stacked together and
         projected to the language model's embedding space using a few linear layers.
         The audio and text embeddings are merged together. A special token `<|audio|>` is used to indicate the start
         of the audio embeddings in the merged embeddings.
-
         Args:
             input_ids: The tokenized text input.
             audio_values: The processed audio values.
@@ -316,14 +295,36 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
             inputs_embeds = self.get_input_embeddings().forward(input_ids)
 
         if audio_values is not None and len(audio_values) > 0:
-            inputs_embeds = self._prepare_audio_embeds(
-                inputs_embeds=inputs_embeds,
-                audio_values=audio_values,
-                audio_token_start_idx=audio_token_start_idx,
-                audio_lens=audio_lens,
-                audio_token_len=audio_token_len,
-                audio_batch_size=audio_batch_size,
-            )
+            assert (
+                audio_token_start_idx is not None
+                and audio_token_len is not None
+                and audio_lens is not None
+                and audio_batch_size is not None
+            ), "audio_token_start_idx/audio_token_len/audio_lens must be provided if audio_values are provided."
+            assert (
+                len(audio_token_start_idx)
+                == len(audio_token_len)
+                == len(audio_lens)
+                == len(audio_values)
+            ), "audio_token_start_idx/audio_token_len/audio_lens/audio_values must have the same batch size."
+            assert len(audio_batch_size) == len(
+                inputs_embeds
+            ), "audio_batch_size and inputs_embeds must have the same batch size."
+
+            # B x A/3200 x (D=max-audio-length-in-batch)
+            audio_tower_output = self.audio_tower.forward(
+                audio_values.to(self.audio_tower.dtype),
+                audio_len=audio_lens,
+            ).last_hidden_state
+            audio_tower_output = audio_tower_output.to(inputs_embeds.dtype)
+            audio_embeds = self.multi_modal_projector.forward(audio_tower_output)
+
+            # combine audio and text embeddings
+            for i_b, i_a in self._audio_iter(audio_batch_size):
+                start_idx = audio_token_start_idx[i_a]
+                token_len = audio_token_len[i_a]
+                item_embedding = audio_embeds[i_a][:token_len]
+                inputs_embeds[i_b][start_idx : start_idx + token_len] = item_embedding
 
         lm_output = self.language_model.forward(
             inputs_embeds=inputs_embeds,
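The merge loop added above indexes audio items through `self._audio_iter(audio_batch_size)`, whose body is not shown in this diff. A hedged sketch of what that iteration plausibly yields, consistent with how the loop uses it:

    from typing import Generator, Tuple
    import torch

    def audio_iter(audio_batch_size: torch.Tensor) -> Generator[Tuple[int, int], None, None]:
        """Yield (batch index, running audio index) for every audio item in the batch."""
        audio_index = 0
        for batch_index, num_audios in enumerate(audio_batch_size.tolist()):
            for _ in range(int(num_audios)):
                yield batch_index, audio_index
                audio_index += 1

    print(list(audio_iter(torch.tensor([2, 0, 1]))))   # [(0, 0), (0, 1), (2, 2)]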
@@ -334,9 +335,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         )
         if self.training:
             if self.loss_config.loss_function == LossFunction.CrossEntropy:
-                pass
+                return lm_output
             elif self.loss_config.loss_function == LossFunction.KL_Divergence:
-                lm_output.loss = self._compute_kl_loss(
+                return self._compute_kl_loss(
                     lm_output=lm_output,
                     labels=labels,
                     past_key_values=past_key_values,
@@ -349,82 +350,52 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
                 raise ValueError(
                     f"Unsupported loss function: {self.loss_config.loss_function}"
                 )
-        return lm_output
+        else:
+            return lm_output
 
-    def _prepare_audio_embeds(
-        self,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        audio_values: Optional[torch.Tensor] = None,
-        audio_token_start_idx: Optional[torch.Tensor] = None,
-        audio_lens: Optional[torch.Tensor] = None,
-        audio_token_len: Optional[torch.Tensor] = None,
-        audio_batch_size: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        assert (
-            inputs_embeds is not None
-            and audio_values is not None
-            and audio_token_start_idx is not None
-            and audio_token_len is not None
-            and audio_lens is not None
-            and audio_batch_size is not None
-        ), "inputs_embeds/audio_values/audio_token_start_idx/audio_token_len/audio_lens/audio_batch_size must be provided."
-        assert (
-            len(audio_token_start_idx)
-            == len(audio_token_len)
-            == len(audio_lens)
-            == len(audio_values)
-        ), "audio_token_start_idx/audio_token_len/audio_lens/audio_values must have the same batch size."
-        assert len(audio_batch_size) == len(
-            inputs_embeds
-        ), "audio_batch_size and inputs_embeds must have the same batch size."
-
-        # B x A/3200 x (D=max-audio-length-in-batch)
-        audio_tower_output = self.audio_tower.forward(
-            audio_values.to(self.audio_tower.dtype),
-            audio_len=audio_lens,
-        ).last_hidden_state
-        audio_tower_output = audio_tower_output.to(inputs_embeds.dtype)
-        audio_embeds = self.multi_modal_projector.forward(audio_tower_output)
-
-        # combine audio and text embeddings
-        for i_b, i_a in self._audio_iter(audio_batch_size):
-            start_idx = audio_token_start_idx[i_a]
-            token_len = audio_token_len[i_a]
-            item_embedding = audio_embeds[i_a][:token_len]
-            inputs_embeds[i_b][start_idx : start_idx + token_len] = item_embedding
-
-        return inputs_embeds
-
-    def generate(
+    def prepare_inputs_for_generation(
         self,
         input_ids: torch.Tensor,
-        audio_values: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        audio_values: Optional[torch.FloatTensor] = None,
         audio_token_start_idx: Optional[torch.Tensor] = None,
-        audio_lens: Optional[torch.Tensor] = None,
         audio_token_len: Optional[torch.Tensor] = None,
+        audio_lens: Optional[torch.Tensor] = None,
         audio_batch_size: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        cache_position: Optional[torch.Tensor] = None,
         **kwargs,
-    ) -> torch.Tensor:
-        if inputs_embeds is None:
-            inputs_embeds = self.get_input_embeddings().forward(input_ids)
-
-        if audio_values is not None and len(audio_values) > 0:
-            inputs_embeds = self._prepare_audio_embeds(
-                inputs_embeds=inputs_embeds,
-                audio_values=audio_values,
-                audio_token_start_idx=audio_token_start_idx,
-                audio_lens=audio_lens,
-                audio_token_len=audio_token_len,
-                audio_batch_size=audio_batch_size,
-            )
-
-        return self.language_model.generate(
+    ) -> Dict[str, Any]:
+        model_input = self.language_model.prepare_inputs_for_generation(
             input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
             inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
             **kwargs,
         )
 
+        # include audio information in model_input only when it is needed during prefilling
+        # audio_token_start_idx should always be relative to the current cache position
+        prefill_start_idx: int | torch.Tensor = (
+            0 if cache_position is None else cache_position[0]
+        )
+        if (
+            audio_values is not None
+            and audio_token_start_idx is not None
+            and prefill_start_idx <= torch.max(audio_token_start_idx)
+        ):
+            model_input["audio_values"] = audio_values
+            model_input["audio_token_start_idx"] = (
+                audio_token_start_idx - prefill_start_idx
+            )
+            model_input["audio_token_len"] = audio_token_len
+            model_input["audio_batch_size"] = audio_batch_size
+            model_input["audio_lens"] = audio_lens
+
+        return model_input
+
     @classmethod
     def _create_multi_modal_projector(
         cls, config: UltravoxConfig
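A small illustration with toy values, not the commit's code, of the prefill check in `prepare_inputs_for_generation`: audio tensors are only forwarded while the cache position has not advanced past the last audio placeholder, and the start indices are re-based onto the current window.

    import torch

    audio_token_start_idx = torch.tensor([3, 10])    # placeholder starts for two audio items
    cache_position = torch.tensor([0, 1, 2, 3])      # token positions written in this forward call

    prefill_start_idx = cache_position[0]
    needs_audio = prefill_start_idx <= torch.max(audio_token_start_idx)   # True only during prefill
    relative_start_idx = audio_token_start_idx - prefill_start_idx        # re-based start positions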
@@ -454,9 +425,6 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
             audio_tower.init_latency_mask(
                 config.audio_latency_block_size, dtype=config.torch_dtype
             )
-            audio_tower.init_latency_mask(
-                config.audio_latency_block_size, dtype=config.torch_dtype
-            )
         else:
             assert config.audio_latency_block_size in (
                 None,
@@ -539,9 +507,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
                 )
             )
 
-        if hasattr(self, "audio_tower") and isinstance(
-            self.audio_tower, peft.PeftModel
-        ):
+        if isinstance(self.audio_tower, peft.PeftModel):
             self.audio_tower = self.audio_tower.merge_and_unload()
             # no need to download base audio model weights anymore, so we can remove the id
             self.config.audio_model_id = None
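For context, `merge_and_unload()` is the standard PEFT call for folding LoRA weights back into the wrapped base module. A hedged helper sketch, not from the repository:

    import peft

    def maybe_merge_lora(module):
        # If the module is wrapped in a PEFT adapter, fold the LoRA weights into the
        # base weights and drop the wrapper (mirrors the branch in the diff above).
        if isinstance(module, peft.PeftModel):
            module = module.merge_and_unload()
        return module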
@@ -607,33 +573,18 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         )
 
         lm_trainable_params, lm_all_params = count_params(self.language_model)
-        if hasattr(self, "audio_tower") and self.audio_tower is not None:
-            audio_trainable_params, audio_all_params = count_params(self.audio_tower)
-        else:
-            audio_trainable_params, audio_all_params = 0, 0
+        audio_trainable_params, audio_all_params = count_params(self.audio_tower)
 
         projector_trainable_params = (
             trainable_params - lm_trainable_params - audio_trainable_params
         )
         projector_all_params = all_param - lm_all_params - audio_all_params
 
-        # Calculate percentages only if the total parameters are non-zero
-        audio_percent = (
-            0.0
-            if audio_all_params == 0
-            else 100 * audio_trainable_params / audio_all_params
-        )
-        projector_percent = (
-            0.0
-            if projector_all_params == 0
-            else 100 * projector_trainable_params / projector_all_params
-        )
-
         logging.info(
             f"Trainable%: "
             f" LLM: {100 * lm_trainable_params / lm_all_params:.1f}%"
-            f" || Audio Encoder: {audio_percent:.1f}%"
-            f" || Projector: {projector_percent:.1f}%"
+            f" || Audio Encoder: {100 * audio_trainable_params / audio_all_params:.1f}%"
+            f" || Projector: {100 * projector_trainable_params / projector_all_params:.1f}%"
         )
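`count_params` itself is not shown in this diff; a plain-PyTorch sketch of the trainable/total counting that the logging above implies:

    from typing import Tuple
    import torch.nn as nn

    def count_params(module: nn.Module) -> Tuple[int, int]:
        trainable = sum(p.numel() for p in module.parameters() if p.requires_grad)
        total = sum(p.numel() for p in module.parameters())
        return trainable, total

    # trainable, total = count_params(model.language_model)
    # print(f"LLM trainable%: {100 * trainable / total:.1f}%")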
@@ -770,7 +721,6 @@ class UltravoxProjector(nn.Module):
     Takes in audio features from the audio tower and projects them to the text model's embedding space.
     It reduces the number of frames by a factor of `stack_factor` and increases the number of channels by the same factor.
     If the number of audio frames are not a multiple of the stack factor, the last few frames will be padded with zeros.
-
     Input shape:
         audio_features: B, T*S, C
     Output shape:
@@ -784,7 +734,6 @@
             C: number of channels out of the encoder (aka audio tower)
             H: hidden size of the projector (config.hidden_size)
             D: dimension of the text model (config.text_config.hidden_size)
-
         """
         # B, F, C -> B, T, C*S
         audio_features = self._pad_and_stack(audio_features)
@@ -805,13 +754,11 @@ class ModifiedWhisperEncoder(
 ):
     """
     Encoder portion of OpenAI's Whisper model.
-
     This implementation is a slightly modified version of HF Transformers' Whisper Encoder, with only a few fixes:
     1. base_model_prefix updated to allow for doing `.from_pretrained` directly on the encoder
     2. allow less than 30 second of audio padding to be passed in:
         - relaxed ValueError check for `input_features` length to be less than or equal to `expected_seq_length` instead of strictly equal
         - embed_pos is now sliced to match the length of `inputs_embeds`
-
     Original: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py
     """
@@ -913,7 +860,7 @@ class ModifiedWhisperEncoder(
         # This masking ensures consistent behavior between training and inference
         # by preventing the model from attending to padding tokens in both cases
         attention_mask = None
-        if audio_len is not None:
+        if audio_len != None:
             audio_feature_len = self._get_feat_extract_output_lengths(audio_len)
             max_seq_len = hidden_states.shape[1]
             attention_mask = torch.arange(max_seq_len, device=hidden_states.device)[
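Finally, a toy version, not the commit's code, of the length-based attention mask built after the `audio_len` check above: positions beyond each item's valid feature length are masked out.

    import torch

    audio_feature_len = torch.tensor([5, 3])                 # valid encoder frames per batch item
    max_seq_len = 6
    positions = torch.arange(max_seq_len)                    # (max_seq_len,)
    attention_mask = positions[None, :] < audio_feature_len[:, None]   # (B, max_seq_len) boolean
    # tensor([[ True,  True,  True,  True,  True, False],
    #         [ True,  True,  True, False, False, False]])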