Upload ultravox_processing.py with huggingface_hub
ultravox_processing.py  CHANGED  (+8 -24)
@@ -40,12 +40,12 @@ class DataCollatorForSeq2SeqWithAudio(transformers.DataCollatorForSeq2Seq):
             batch["alt_attention_mask"] = alt_batch["attention_mask"]
             batch["alt_labels"] = alt_batch["labels"]
 
-
-
-
-
-
-
+        batch["audio_token_start_idx"] = torch.stack(audio_token_start_idx)
+        batch["audio_lens"] = torch.stack(audio_lens)
+        batch["audio_token_len"] = torch.stack(audio_token_len)
+
+        # Pad the last dimension of all audio_values to the same length, with 0s on the right.
+        if audio_values:
             max_len = max([x.shape[-1] for x in audio_values])
             batch["audio_values"] = torch.stack(
                 [F.pad(x, (0, max_len - x.shape[-1])) for x in audio_values]
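The six added lines stack the per-example audio bookkeeping tensors and, guarded by `if audio_values:`, right-pad every audio tensor to the longest one in the batch before stacking. A minimal sketch of that padding step with illustrative shapes (not the collator's surrounding code):

```python
import torch
import torch.nn.functional as F

# Two mel-feature tensors of shape (D, T) with different lengths T.
audio_values = [torch.randn(80, 100), torch.randn(80, 60)]

# Zero-pad each tensor's last dimension on the right to the batch maximum,
# then stack into a single rectangular (B, D, T_max) tensor.
max_len = max(x.shape[-1] for x in audio_values)
padded = torch.stack([F.pad(x, (0, max_len - x.shape[-1])) for x in audio_values])
print(padded.shape)  # torch.Size([2, 80, 100])
```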
@@ -67,7 +67,6 @@ class DataCollatorForSeq2SeqWithAudio(transformers.DataCollatorForSeq2Seq):
 class UltravoxProcessor(transformers.ProcessorMixin):
     """
     Constructs an Ultravox processor which wraps an audio processor and a tokenizer into a single processor.
-
     Args:
         audio_processor: The audio processor for the audio encoder.
         tokenizer: The tokenizer for the language model.
@@ -113,18 +112,10 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             tokenizer.eos_token is not None
         ), "The tokenizer has no EOS token. Cannot recover."
         self.vocab = tokenizer.get_vocab()
-        # VLLM currently relies on updating audio_token_replacement, hence to be safe
-        # we should not update it. This dependency should be removed in the future.
         self.audio_token_replacement = tokenizer.eos_token
         if tokenizer.pad_token_id is None:
             tokenizer.pad_token_id = tokenizer.eos_token_id
 
-        # Use a dummy audio processor to satisfy the base class for text-only training
-        if audio_processor is None:
-            audio_processor = transformers.AutoProcessor.from_pretrained(
-                "openai/whisper-tiny"
-            )
-
         super().__init__(audio_processor=audio_processor, tokenizer=tokenizer)
 
     @classmethod
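With the `whisper-tiny` fallback deleted, the caller now has to supply an audio processor explicitly. A hedged usage sketch; the Whisper checkpoint matches the removed fallback, while the tokenizer checkpoint is a placeholder assumption:

```python
import transformers

# Any compatible audio processor works; "openai/whisper-tiny" mirrors the
# fallback this commit removed. The tokenizer checkpoint is hypothetical.
audio_processor = transformers.AutoProcessor.from_pretrained("openai/whisper-tiny")
tokenizer = transformers.AutoTokenizer.from_pretrained("your/llm-checkpoint")

processor = UltravoxProcessor(audio_processor=audio_processor, tokenizer=tokenizer)
```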
@@ -159,18 +150,15 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         """
         Processes the audio batch by chunking any items in the batch according to the audio_context_size,
         padding the last chunk if needed, and returns a dictionary with updated audio data.
-
         Args:
             audio_values (torch.Tensor): A tensor of audio values (e.g., in B, D, T format).
             audio_lens (torch.Tensor): A tensor of audio lengths.
-
         Returns:
             Dict[str, Any]: Dictionary with the following keys:
                 - "audio_values": The concatenated audio tensor after chunking and padding.
                 - "audio_lens": Tensor of lengths for each chunk.
                 - "audio_is_continuation": Tensor of booleans indicating if the chunk is a continuation of the previous chunk.
                 - "audio_batch_size": A Tensor with one integer representing the number of chunks.
-
         """
         chunked_audio_values: List[torch.Tensor] = []
         chunked_audio_lens: List[int] = []
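The docstring above fully specifies the chunking contract. The method body is not shown in this diff, so the following is only a possible implementation sketch under the docstring's assumptions (B, D, T layout, `audio_context_size` counted in frames):

```python
import torch
import torch.nn.functional as F

def chunk_audio(audio_values, audio_lens, audio_context_size):
    chunked_values, chunked_lens, is_continuation = [], [], []
    for sample, length in zip(audio_values, audio_lens):  # sample: (D, T)
        for start in range(0, int(length), audio_context_size):
            chunk = sample[..., start:start + audio_context_size]
            chunked_lens.append(min(audio_context_size, int(length) - start))
            # Pad the final, shorter chunk up to the context size.
            if chunk.shape[-1] < audio_context_size:
                chunk = F.pad(chunk, (0, audio_context_size - chunk.shape[-1]))
            chunked_values.append(chunk)
            is_continuation.append(start > 0)
    return {
        "audio_values": torch.stack(chunked_values),
        "audio_lens": torch.tensor(chunked_lens),
        "audio_is_continuation": torch.tensor(is_continuation),
        "audio_batch_size": torch.tensor([len(chunked_values)]),
    }
```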
@@ -236,7 +224,6 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         the text. To prepare the audio(s), this method forwards the `audio`, `sampling_rate` and `kwargs` arguments to
         audio processor's [`~WhisperProcessor.__call__`] if `audio` is not `None`. Please refer to the docstring
         of the above two methods for more information.
-
         Args:
             text (`str`, `List[str]`):
                 The sequence to be encoded. Sequence can be a string or (pretokenized string).
@@ -249,15 +236,12 @@ class UltravoxProcessor(transformers.ProcessorMixin):
                 you are doing.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
-
                 - `'tf'`: Return TensorFlow `tf.constant` objects.
                 - `'pt'`: Return PyTorch `torch.Tensor` objects.
                 - `'np'`: Return NumPy `np.ndarray` objects.
                 - `'jax'`: Return JAX `jnp.ndarray` objects.
-
         Returns:
             [`BatchFeature`]: A [`BatchFeature`] with the following fields:
-
             - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
               `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
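A hedged example of calling the processor as documented, continuing the `processor` constructed earlier. The prompt text and sampling rate are illustrative assumptions; the audio-placeholder convention inside `text` is not shown in this diff:

```python
import numpy as np

# One second of silence at an assumed 16 kHz sampling rate.
audio = np.zeros(16000, dtype=np.float32)

inputs = processor(
    text="Transcribe the audio.",  # real prompts embed an audio placeholder
    audio=audio,
    sampling_rate=16000,
    return_tensors="pt",           # PyTorch tensors, per the list above
)
print(inputs["input_ids"].shape, inputs["audio_values"].shape)
```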
@@ -335,7 +319,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         split_input_ids = tokenized_parts["input_ids"]
         input_ids: List[int] = []
 
-
+        audio_token_replacement_token_id = self.vocab[self.audio_token_replacement]
 
         for i, token_len in enumerate(data.get("audio_token_len", [])):
             if not audio_is_continuation[i]:
@@ -349,7 +333,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
 
             audio_token_start_idx.append(len(input_ids))
 
-            input_ids.extend([
+            input_ids.extend([audio_token_replacement_token_id] * token_len)
 
         # Include any tokens after the last audio.
         placeholder_index += 1
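Taken together, the two changed lines resolve the replacement token id once (old line 338) and then reserve `token_len` slots per audio chunk in the token sequence (old line 352). A toy illustration with made-up ids:

```python
# All ids below are illustrative; in the processor the replacement id comes
# from self.vocab[self.audio_token_replacement] (the tokenizer's EOS token).
audio_token_replacement_token_id = 2

input_ids = [101, 7592]                   # text tokens before the audio
audio_token_start_idx = [len(input_ids)]  # where audio embeddings get swapped in
token_len = 4                             # audio tokens for this chunk

input_ids.extend([audio_token_replacement_token_id] * token_len)
print(input_ids)  # [101, 7592, 2, 2, 2, 2]
```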