Upload ultravox_processing.py with huggingface_hub
ultravox_processing.py  CHANGED  (+8 -24)
@@ -40,12 +40,12 @@ class DataCollatorForSeq2SeqWithAudio(transformers.DataCollatorForSeq2Seq):
             batch["alt_attention_mask"] = alt_batch["attention_mask"]
             batch["alt_labels"] = alt_batch["labels"]
 
-
-
-
-
-
-
+        batch["audio_token_start_idx"] = torch.stack(audio_token_start_idx)
+        batch["audio_lens"] = torch.stack(audio_lens)
+        batch["audio_token_len"] = torch.stack(audio_token_len)
+
+        # Pad the last dimension of all audio_values to the same length, with 0s on the right.
+        if audio_values:
             max_len = max([x.shape[-1] for x in audio_values])
             batch["audio_values"] = torch.stack(
                 [F.pad(x, (0, max_len - x.shape[-1])) for x in audio_values]
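The six added lines stack the per-example audio bookkeeping tensors and, guarded by `if audio_values:`, right-pad every audio tensor to the longest one in the batch before stacking. A minimal sketch of that padding step with illustrative shapes (not the collator's surrounding code):

```python
import torch
import torch.nn.functional as F

# Two mel-feature tensors of shape (D, T) with different lengths T.
audio_values = [torch.randn(80, 100), torch.randn(80, 60)]

# Zero-pad each tensor's last dimension on the right to the batch maximum,
# then stack into a single rectangular (B, D, T_max) tensor.
max_len = max(x.shape[-1] for x in audio_values)
padded = torch.stack([F.pad(x, (0, max_len - x.shape[-1])) for x in audio_values])
print(padded.shape)  # torch.Size([2, 80, 100])
```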
@@ -67,7 +67,6 @@ class DataCollatorForSeq2SeqWithAudio(transformers.DataCollatorForSeq2Seq):
 class UltravoxProcessor(transformers.ProcessorMixin):
     """
     Constructs an Ultravox processor which wraps an audio processor and a tokenizer into a single processor.
-
     Args:
         audio_processor: The audio processor for the audio encoder.
         tokenizer: The tokenizer for the language model.
@@ -113,18 +112,10 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             tokenizer.eos_token is not None
         ), "The tokenizer has no EOS token. Cannot recover."
         self.vocab = tokenizer.get_vocab()
-        # VLLM currently relies on updating audio_token_replacement, hence to be safe
-        # we should not update it. This dependency should be removed in the future.
         self.audio_token_replacement = tokenizer.eos_token
         if tokenizer.pad_token_id is None:
             tokenizer.pad_token_id = tokenizer.eos_token_id
 
-        # Use a dummy audio processor to satisfy the base class for text-only training
-        if audio_processor is None:
-            audio_processor = transformers.AutoProcessor.from_pretrained(
-                "openai/whisper-tiny"
-            )
-
         super().__init__(audio_processor=audio_processor, tokenizer=tokenizer)
 
     @classmethod
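With the `whisper-tiny` fallback deleted, the caller now has to supply an audio processor explicitly. A hedged usage sketch; the Whisper checkpoint matches the removed fallback, while the tokenizer checkpoint is a placeholder assumption:

```python
import transformers

# Any compatible audio processor works; "openai/whisper-tiny" mirrors the
# fallback this commit removed. The tokenizer checkpoint is hypothetical.
audio_processor = transformers.AutoProcessor.from_pretrained("openai/whisper-tiny")
tokenizer = transformers.AutoTokenizer.from_pretrained("your/llm-checkpoint")

processor = UltravoxProcessor(audio_processor=audio_processor, tokenizer=tokenizer)
```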
@@ -159,18 +150,15 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         """
         Processes the audio batch by chunking any items in the batch according to the audio_context_size,
         padding the last chunk if needed, and returns a dictionary with updated audio data.
-
         Args:
             audio_values (torch.Tensor): A tensor of audio values (e.g., in B, D, T format).
             audio_lens (torch.Tensor): A tensor of audio lengths.
-
         Returns:
             Dict[str, Any]: Dictionary with the following keys:
                 - "audio_values": The concatenated audio tensor after chunking and padding.
                 - "audio_lens": Tensor of lengths for each chunk.
                 - "audio_is_continuation": Tensor of booleans indicating if the chunk is a continuation of the previous chunk.
                 - "audio_batch_size": A Tensor with one integer representing the number of chunks.
-
         """
         chunked_audio_values: List[torch.Tensor] = []
         chunked_audio_lens: List[int] = []
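The docstring above fully specifies the chunking contract. The method body is not shown in this diff, so the following is only a possible implementation sketch under the docstring's assumptions (B, D, T layout, `audio_context_size` counted in frames):

```python
import torch
import torch.nn.functional as F

def chunk_audio(audio_values, audio_lens, audio_context_size):
    chunked_values, chunked_lens, is_continuation = [], [], []
    for sample, length in zip(audio_values, audio_lens):  # sample: (D, T)
        for start in range(0, int(length), audio_context_size):
            chunk = sample[..., start:start + audio_context_size]
            chunked_lens.append(min(audio_context_size, int(length) - start))
            # Pad the final, shorter chunk up to the context size.
            if chunk.shape[-1] < audio_context_size:
                chunk = F.pad(chunk, (0, audio_context_size - chunk.shape[-1]))
            chunked_values.append(chunk)
            is_continuation.append(start > 0)
    return {
        "audio_values": torch.stack(chunked_values),
        "audio_lens": torch.tensor(chunked_lens),
        "audio_is_continuation": torch.tensor(is_continuation),
        "audio_batch_size": torch.tensor([len(chunked_values)]),
    }
```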
@@ -236,7 +224,6 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         the text. To prepare the audio(s), this method forwards the `audio`, `sampling_rate` and `kwargs` arguments to
         audio processor's [`~WhisperProcessor.__call__`] if `audio` is not `None`. Please refer to the docstring
         of the above two methods for more information.
-
         Args:
             text (`str`, `List[str]`):
                 The sequence to be encoded. Sequence can be a string or (pretokenized string).
@@ -249,15 +236,12 @@ class UltravoxProcessor(transformers.ProcessorMixin):
                 you are doing.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
-
                 - `'tf'`: Return TensorFlow `tf.constant` objects.
                 - `'pt'`: Return PyTorch `torch.Tensor` objects.
                 - `'np'`: Return NumPy `np.ndarray` objects.
                 - `'jax'`: Return JAX `jnp.ndarray` objects.
-
         Returns:
             [`BatchFeature`]: A [`BatchFeature`] with the following fields:
-
             - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
               `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
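A hedged example of calling the processor as documented, continuing the `processor` constructed earlier. The prompt text and sampling rate are illustrative assumptions; the audio-placeholder convention inside `text` is not shown in this diff:

```python
import numpy as np

# One second of silence at an assumed 16 kHz sampling rate.
audio = np.zeros(16000, dtype=np.float32)

inputs = processor(
    text="Transcribe the audio.",  # real prompts embed an audio placeholder
    audio=audio,
    sampling_rate=16000,
    return_tensors="pt",           # PyTorch tensors, per the list above
)
print(inputs["input_ids"].shape, inputs["audio_values"].shape)
```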
@@ -335,7 +319,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         split_input_ids = tokenized_parts["input_ids"]
         input_ids: List[int] = []
 
-
+        audio_token_replacement_token_id = self.vocab[self.audio_token_replacement]
 
         for i, token_len in enumerate(data.get("audio_token_len", [])):
             if not audio_is_continuation[i]:
@@ -349,7 +333,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
 
             audio_token_start_idx.append(len(input_ids))
 
-            input_ids.extend([
+            input_ids.extend([audio_token_replacement_token_id] * token_len)
 
         # Include any tokens after the last audio.
         placeholder_index += 1
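Taken together, the two changed lines resolve the replacement token id once (old line 338) and then reserve `token_len` slots per audio chunk in the token sequence (old line 352). A toy illustration with made-up ids:

```python
# All ids below are illustrative; in the processor the replacement id comes
# from self.vocab[self.audio_token_replacement] (the tokenizer's EOS token).
audio_token_replacement_token_id = 2

input_ids = [101, 7592]                   # text tokens before the audio
audio_token_start_idx = [len(input_ids)]  # where audio embeddings get swapped in
token_len = 4                             # audio tokens for this chunk

input_ids.extend([audio_token_replacement_token_id] * token_len)
print(input_ids)  # [101, 7592, 2, 2, 2, 2]
```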