AtAndDev committed
Commit 748e8e5 · verified · 1 Parent(s): 7fc118b

Upload ultravox_processing.py with huggingface_hub

Files changed (1)
  1. ultravox_processing.py +8 -24
ultravox_processing.py CHANGED
@@ -40,12 +40,12 @@ class DataCollatorForSeq2SeqWithAudio(transformers.DataCollatorForSeq2Seq):
             batch["alt_attention_mask"] = alt_batch["attention_mask"]
             batch["alt_labels"] = alt_batch["labels"]
 
-        # Only process audio fields if we have non-empty audio values
-        if audio_values and len(audio_values) > 0 and len(audio_values[0]) > 0:
-            batch["audio_token_start_idx"] = torch.stack(audio_token_start_idx)
-            batch["audio_lens"] = torch.stack(audio_lens)
-            batch["audio_token_len"] = torch.stack(audio_token_len)
-            # Pad the last dimension of all audio_values to the same length, with 0s on the right.
+        batch["audio_token_start_idx"] = torch.stack(audio_token_start_idx)
+        batch["audio_lens"] = torch.stack(audio_lens)
+        batch["audio_token_len"] = torch.stack(audio_token_len)
+
+        # Pad the last dimension of all audio_values to the same length, with 0s on the right.
+        if audio_values:
             max_len = max([x.shape[-1] for x in audio_values])
             batch["audio_values"] = torch.stack(
                 [F.pad(x, (0, max_len - x.shape[-1])) for x in audio_values]
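The padding step this hunk touches can be tried in isolation. A minimal sketch, assuming mel-spectrogram-style features of shape (D, T) with varying T; the shapes are illustrative, not taken from the model config:

```python
import torch
import torch.nn.functional as F

# Hypothetical batch of (D, T) audio features with different time lengths.
audio_values = [torch.randn(80, 120), torch.randn(80, 95), torch.randn(80, 200)]

# Pad the last dimension of all audio_values to the same length, with 0s on
# the right, as the collator does before stacking into one batch tensor.
max_len = max(x.shape[-1] for x in audio_values)
batch_audio = torch.stack([F.pad(x, (0, max_len - x.shape[-1])) for x in audio_values])

print(batch_audio.shape)  # torch.Size([3, 80, 200])
```

Note that the new `if audio_values:` guard is what keeps `max(...)` from raising on an empty batch; the removed, stricter guard had also skipped the `torch.stack` calls on the index tensors.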
@@ -67,7 +67,6 @@ class DataCollatorForSeq2SeqWithAudio(transformers.DataCollatorForSeq2Seq):
 class UltravoxProcessor(transformers.ProcessorMixin):
     """
     Constructs an Ultravox processor which wraps an audio processor and a tokenizer into a single processor.
-
     Args:
         audio_processor: The audio processor for the audio encoder.
         tokenizer: The tokenizer for the language model.
@@ -113,18 +112,10 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             tokenizer.eos_token is not None
         ), "The tokenizer has no EOS token. Cannot recover."
         self.vocab = tokenizer.get_vocab()
-        # VLLM currently relies on updating audio_token_replacement, hence to be safe
-        # we should not update it. This dependency should be removed in the future.
         self.audio_token_replacement = tokenizer.eos_token
         if tokenizer.pad_token_id is None:
             tokenizer.pad_token_id = tokenizer.eos_token_id
 
-        # Use a dummy audio processor to satisfy the base class for text-only training
-        if audio_processor is None:
-            audio_processor = transformers.AutoProcessor.from_pretrained(
-                "openai/whisper-tiny"
-            )
-
         super().__init__(audio_processor=audio_processor, tokenizer=tokenizer)
 
     @classmethod
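The constructor lines around this hunk (EOS assertion, placeholder choice, pad-token fallback) are self-contained enough to demonstrate with any Hugging Face tokenizer. A minimal sketch; `gpt2` is just an example of a tokenizer that has an EOS token but no pad token:

```python
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")  # illustrative choice

# Mirrors the processor's constructor: the EOS token doubles as the audio
# placeholder replacement, and as the pad token when none is defined.
assert tokenizer.eos_token is not None, "The tokenizer has no EOS token. Cannot recover."
audio_token_replacement = tokenizer.eos_token
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(audio_token_replacement, tokenizer.pad_token_id)  # <|endoftext|> 50256
```

With the whisper-tiny fallback removed, callers are now expected to pass a real audio processor even for text-only training.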
@@ -159,18 +150,15 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         """
         Processes the audio batch by chunking any items in the batch according to the audio_context_size,
         padding the last chunk if needed, and returns a dictionary with updated audio data.
-
         Args:
             audio_values (torch.Tensor): A tensor of audio values (e.g., in B, D, T format).
             audio_lens (torch.Tensor): A tensor of audio lengths.
-
         Returns:
             Dict[str, Any]: Dictionary with the following keys:
                 - "audio_values": The concatenated audio tensor after chunking and padding.
                 - "audio_lens": Tensor of lengths for each chunk.
                 - "audio_is_continuation": Tensor of booleans indicating if the chunk is a continuation of the previous chunk.
                 - "audio_batch_size": A Tensor with one integer representing the number of chunks.
-
         """
         chunked_audio_values: List[torch.Tensor] = []
         chunked_audio_lens: List[int] = []
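The docstring above fully specifies the chunking contract, which can be sketched independently of the class. A minimal single-item sketch, assuming a (D, T) tensor and a frame-count `audio_context_size`; the real method works on whole batches, and `chunk_one` is a hypothetical helper name:

```python
from typing import Any, Dict, List

import torch
import torch.nn.functional as F

def chunk_one(audio: torch.Tensor, audio_len: int, context_size: int) -> Dict[str, Any]:
    """Split one (D, T) audio tensor into context_size-wide chunks, padding the last one."""
    chunks: List[torch.Tensor] = []
    lens: List[int] = []
    is_continuation: List[bool] = []
    for start in range(0, audio_len, context_size):
        chunk = audio[..., start : start + context_size]
        lens.append(chunk.shape[-1])
        if chunk.shape[-1] < context_size:  # pad the final, shorter chunk
            chunk = F.pad(chunk, (0, context_size - chunk.shape[-1]))
        chunks.append(chunk)
        is_continuation.append(start > 0)  # every chunk after the first continues the audio
    return {
        "audio_values": torch.stack(chunks),
        "audio_lens": torch.tensor(lens),
        "audio_is_continuation": torch.tensor(is_continuation),
        "audio_batch_size": torch.tensor([len(chunks)]),
    }

out = chunk_one(torch.randn(80, 2500), audio_len=2500, context_size=1000)
print(out["audio_values"].shape, out["audio_is_continuation"])
# torch.Size([3, 80, 1000]) tensor([False,  True,  True])
```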
@@ -236,7 +224,6 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         the text. To prepare the audio(s), this method forwards the `audio`, `sampling_rate` and `kwargs` arguments to
         audio processor's [`~WhisperProcessor.__call__`] if `audio` is not `None`. Please refer to the docstring
         of the above two methods for more information.
-
         Args:
             text (`str`, `List[str]`):
                 The sequence to be encoded. Sequence can be a string or (pretokenized string).
@@ -249,15 +236,12 @@ class UltravoxProcessor(transformers.ProcessorMixin):
                 you are doing.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
-
                 - `'tf'`: Return TensorFlow `tf.constant` objects.
                 - `'pt'`: Return PyTorch `torch.Tensor` objects.
                 - `'np'`: Return NumPy `np.ndarray` objects.
                 - `'jax'`: Return JAX `jnp.ndarray` objects.
-
         Returns:
             [`BatchFeature`]: A [`BatchFeature`] with the following fields:
-
             - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
               `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
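Putting the docstring together, a hypothetical end-to-end call might look as follows. The checkpoint name, the `trust_remote_code` flag, and the `<|audio|>` placeholder in the prompt are assumptions for illustration, not taken from this file; only the `text`/`audio`/`sampling_rate`/`return_tensors` arguments come from the docstring above:

```python
import numpy as np
import transformers

# Hypothetical repo that ships this UltravoxProcessor via remote code.
processor = transformers.AutoProcessor.from_pretrained(
    "fixie-ai/ultravox-v0_4", trust_remote_code=True
)

audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
inputs = processor(
    text="<|audio|>\nWhat is said in the clip?",  # placeholder token is assumed
    audio=audio,
    sampling_rate=16000,
    return_tensors="pt",  # PyTorch tensors, per the docstring above
)
print(inputs.keys())  # input_ids, attention_mask, plus the audio_* fields
```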
@@ -335,7 +319,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             split_input_ids = tokenized_parts["input_ids"]
             input_ids: List[int] = []
 
-            audio_replacement_token_id = self.vocab[self.audio_token_replacement]
+            audio_token_replacement_token_id = self.vocab[self.audio_token_replacement]
 
             for i, token_len in enumerate(data.get("audio_token_len", [])):
                 if not audio_is_continuation[i]:
@@ -349,7 +333,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
 
                 audio_token_start_idx.append(len(input_ids))
 
-                input_ids.extend([audio_replacement_token_id] * token_len)
+                input_ids.extend([audio_token_replacement_token_id] * token_len)
 
             # Include any tokens after the last audio.
             placeholder_index += 1
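The two renamed lines are easiest to read in isolation. A minimal sketch of the placeholder-expansion loop with made-up token ids and lengths; only the loop body mirrors the file, everything else is scaffolding:

```python
from typing import List

audio_token_replacement_token_id = 2  # e.g. the EOS token id; illustrative value
split_input_ids = [[101, 7592], [1029, 102]]  # text segments around the audio; made up
audio_token_len = [3, 2]                      # tokens reserved per audio chunk; made up
audio_is_continuation = [False, True]         # second chunk continues the first audio

input_ids: List[int] = []
audio_token_start_idx: List[int] = []
placeholder_index = 0

for i, token_len in enumerate(audio_token_len):
    if not audio_is_continuation[i]:
        # A new audio: first emit the text segment that precedes it.
        input_ids.extend(split_input_ids[placeholder_index])
        placeholder_index += 1
    # Record where this chunk's run of replacement tokens begins,
    # then reserve token_len copies of the replacement token id.
    audio_token_start_idx.append(len(input_ids))
    input_ids.extend([audio_token_replacement_token_id] * token_len)

# Include any tokens after the last audio.
input_ids.extend(split_input_ids[placeholder_index])

print(input_ids)              # [101, 7592, 2, 2, 2, 2, 2, 1029, 102]
print(audio_token_start_idx)  # [2, 5]
```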