Upload 5 files

Changed files:
- processor_config.json (+1, -1)
- ultravox_config.py (+19, -7)
- ultravox_model.py (+194, -61)
- ultravox_processing.py (+232, -63)
processor_config.json
CHANGED

@@ -5,7 +5,7 @@
   "auto_map": {
     "AutoProcessor": "ultravox_processing.UltravoxProcessor"
   },
-  "encoder_ds_factor":
+  "encoder_ds_factor": 2,
   "processor_class": "UltravoxProcessor",
   "stack_factor": 8
 }
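With `encoder_ds_factor` stored as 2 and `stack_factor` as 8, the processor derives the number of placeholder tokens per audio clip as `ceil(mel_frames / (encoder_ds_factor * stack_factor))` (see the `audio_token_len` computation in `ultravox_processing.py` below). A rough sketch of that bookkeeping, assuming 16 kHz input and a Whisper-style 160-sample hop (both assumptions, not values set by this file):

```python
import math

# Rough sketch of the token-count bookkeeping implied by processor_config.json.
# Assumptions (not part of this diff): 16 kHz audio and a Whisper-style feature
# extractor with a hop length of 160 samples (~100 mel frames per second).
SAMPLE_RATE = 16_000
HOP_LENGTH = 160
ENCODER_DS_FACTOR = 2   # from "encoder_ds_factor": 2
STACK_FACTOR = 8        # from "stack_factor": 8

def approx_audio_token_len(num_samples: int) -> int:
    """Approximate number of placeholder tokens a clip occupies in the prompt."""
    mel_frames = num_samples // HOP_LENGTH
    return math.ceil(mel_frames / (ENCODER_DS_FACTOR * STACK_FACTOR))

# A 10-second clip -> ~1000 mel frames -> ceil(1000 / 16) = 63 tokens.
print(approx_audio_token_len(10 * SAMPLE_RATE))
```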
ultravox_config.py
CHANGED

@@ -32,6 +32,8 @@ class LossFunction(str, Enum):
 class LossConfig:
     loss_function: LossFunction = LossFunction.CrossEntropy
     kl_temperature: float = 2.0
+    # Number of tokens to ignore from the beginning of the sequence. Only used in LSM
+    initial_tokens_to_ignore: int = 0
 
     @property
     def requires_alt_fields(self):

@@ -47,7 +49,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
     documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        audio_config (`
+        audio_config (`WhisperConfig`, *optional*):
             Custom audio config or dict
         text_config (`Union[AutoConfig, dict]`, *optional*):
             The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.

@@ -65,15 +67,17 @@ class UltravoxConfig(transformers.PretrainedConfig):
             The LoRA configuration for finetuning the text model.
         audio_model_lora_config (`LoraConfigSimplified`, *optional*):
             The LoRA configuration for finetuning the audio model.
+        audio_latency_block_size (`int`, *optional*, defaults to `None`):
+            The latency block size for simulating audio streaming.
 
 
     Example:
 
     ```python
-    >>> from transformers import
+    >>> from transformers import UltravoxModel, WhisperConfig, UltravoxConfig, LlamaConfig
 
     >>> # Initializing an audio encoder config
-    >>> audio_config =
+    >>> audio_config = WhisperConfig()
 
     >>> # Initializing a Llama config
     >>> text_config = LlamaConfig()

@@ -82,13 +86,13 @@ class UltravoxConfig(transformers.PretrainedConfig):
     >>> configuration = UltravoxConfig(audio_config, text_config)
 
     >>> # Initializing a completely untrained model from the configuration
-    >>> model =
+    >>> model = UltravoxModel(configuration)
 
     >>> # Accessing the model configuration
     >>> configuration = model.config
 
     >>> # Initialize a model from pretrained checkpoints and random projector weights
-    >>> config = UltravoxConfig(audio_model_id="
+    >>> config = UltravoxConfig(audio_model_id="openai/whisper-tiny", text_model_id="meta-llama/Llama-2-7b-chat-hf")
     ```"""
 
     model_type = "ultravox"

@@ -105,8 +109,10 @@ class UltravoxConfig(transformers.PretrainedConfig):
         stack_factor: int = 8,
         norm_init: float = 0.4,
         projector_act: str = "swiglu",
+        projector_ln_mid: bool = False,  # defaults to False for compatibility with v0.4.1 and below
         text_model_lora_config: Optional[LoraConfigSimplified] = None,
         audio_model_lora_config: Optional[LoraConfigSimplified] = None,
+        audio_latency_block_size: Optional[int] = None,
         **kwargs,
     ):
         self.ignore_index = ignore_index

@@ -118,7 +124,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
         self.stack_factor = stack_factor
         self.norm_init = norm_init
         self.projector_act = projector_act
-
+        self.projector_ln_mid = projector_ln_mid
         if text_model_id is not None:
             self.text_config: transformers.LlamaConfig = (
                 transformers.AutoConfig.from_pretrained(text_model_id)

@@ -136,7 +142,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
         else:
             audio_config = audio_config or {}
             self.audio_config = transformers.CONFIG_MAPPING[
-                audio_config.get("model_type", "
+                audio_config.get("model_type", "whisper")
             ](**audio_config)
 
         self.text_model_lora_config = (

@@ -149,6 +155,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
             if isinstance(audio_model_lora_config, dict)
             else dataclasses.asdict(audio_model_lora_config or LoraConfigSimplified())
         )
+        self.audio_latency_block_size = audio_latency_block_size
 
         self.vocab_size = self.text_config.vocab_size
 

@@ -162,7 +169,12 @@
         # remove text_config and audio_config if text_model_id and audio_model_id are present
         if self.text_model_id is not None:
             diff_dict.pop("text_config", None)
+        elif "text_config" in diff_dict:
+            diff_dict["text_config"].pop("_attn_implementation_autoset", None)
+
         if self.audio_model_id is not None:
             diff_dict.pop("audio_config", None)
+        elif "audio_config" in diff_dict:
+            diff_dict["audio_config"].pop("_attn_implementation_autoset", None)
 
         return diff_dict
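For reference, a minimal usage sketch of the two knobs added to `UltravoxConfig` in this commit. The model IDs are taken from the docstring example, and the block size of 100 is only illustrative (it must divide the Whisper encoder's 3000-frame context evenly):

```python
from ultravox_config import UltravoxConfig  # packaged as a relative import (.ultravox_config) inside the repo

# Illustrative values; audio_latency_block_size=None (the default) disables streaming masking.
config = UltravoxConfig(
    audio_model_id="openai/whisper-tiny",
    text_model_id="meta-llama/Llama-2-7b-chat-hf",
    projector_ln_mid=True,          # v0.5.0+ projector layout: norm between linear_1 and linear_2
    audio_latency_block_size=100,   # simulate streaming in 100-frame blocks
)
print(config.projector_ln_mid, config.audio_latency_block_size)
```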
ultravox_model.py
CHANGED

@@ -1,6 +1,6 @@
 import logging
 import re
-from typing import Any, Dict, Optional, Set, Tuple, Union
+from typing import Any, Dict, Generator, Optional, Set, Tuple, Union
 
 import peft
 import torch

@@ -10,6 +10,7 @@ import transformers
 import transformers.activations
 import transformers.modeling_outputs
 import transformers.models
+from transformers.generation.utils import GenerationMixin
 from transformers.models.whisper import modeling_whisper as whisper
 
 # We must use relative import in this directory to allow uploading to HF Hub

@@ -19,7 +20,7 @@ from .ultravox_config import LossFunction
 from .ultravox_config import UltravoxConfig
 
 
-class UltravoxModel(transformers.LlamaPreTrainedModel):
+class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
     """
     The Ultravox model which consists of an audio encoder and a language model.
 

@@ -35,14 +36,11 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
 
     config_class = UltravoxConfig
     config: UltravoxConfig  # for type hinting
-    #
-
-    #
-
-
-    # Technically we never hit this issue because these keys are already removed from state_dict() however,
-    # but there's no harm in keeping it here for when we change that behavior.
-    _keys_to_ignore_on_load_missing = ["audio_tower.*"]
+    # Usually we load encoder and LLM weights from a pretrained model separately, so they are allowed to be missing
+    _keys_to_ignore_on_load_missing = ["audio_tower.*", "language_model.*"]
+    # Since we have kwargs in forward, we need to set this to False, otherwise grad_accum_steps will cause incorrect train loss to be reported
+    # see https://github.com/huggingface/transformers/issues/35856 and https://github.com/huggingface/trl/pull/2615/files
+    accepts_loss_kwargs = False
 
     def __init__(self, config: UltravoxConfig):
         super().__init__(config)

@@ -52,15 +50,16 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         self.vocab_size = config.vocab_size
 
         self.audio_tower = self._create_audio_tower(config)
+        self.audio_tower_context_length: Optional[int] = None
+        self.audio_tower_context_length = self.audio_tower.max_context_length
+
         self.multi_modal_projector = self._create_multi_modal_projector(config)
         self.language_model = self._create_language_model(config)
 
         # Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
         # FSDP throws an error if some of the layer types are not found in the model.
-        # This would be something like ["LlamaDecoderLayer"
-        self._no_split_modules =
-            self.audio_tower._no_split_modules or []
-        )
+        # This would be something like ["LlamaDecoderLayer"] as we don't split audio encoder layers.
+        self._no_split_modules = self.language_model._no_split_modules
 
         self.loss_config = LossConfig()
         self.post_init()

@@ -147,6 +146,24 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         )
         return {"loss": kl_loss}
 
+    def _audio_iter(
+        self, audio_batch_size: torch.Tensor
+    ) -> Generator[Tuple[int, int], None, None]:
+        """
+        Iterate over the audio batch size and yield the batch index and audio index of each audio item.
+
+        Args:
+            audio_batch_size: A tensor of shape (B,) where B is the batch size.
+
+        Returns:
+            A generator that yields a tuple of (start index, length) for each audio item.
+        """
+        audio_index = 0
+        for i_b, batch_count in enumerate(audio_batch_size):
+            for _ in range(batch_count):
+                yield i_b, audio_index
+                audio_index += 1
+
     def forward(
         self,
         input_ids: torch.Tensor,

@@ -155,7 +172,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         audio_token_start_idx: Optional[torch.Tensor] = None,
+        audio_lens: Optional[torch.Tensor] = None,
         audio_token_len: Optional[torch.Tensor] = None,
+        audio_batch_size: Optional[torch.Tensor] = None,
         past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
         # the alt_* fields are needed for KL divergence loss
         alt_input_ids: Optional[torch.Tensor] = None,

@@ -186,28 +205,37 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         # B x T -> B x T x D
         inputs_embeds = self.get_input_embeddings().forward(input_ids)
 
-        if audio_values is not None:
+        if audio_values is not None and len(audio_values) > 0:
             assert (
-                audio_token_start_idx is not None
-
+                audio_token_start_idx is not None
+                and audio_token_len is not None
+                and audio_lens is not None
+                and audio_batch_size is not None
+            ), "audio_token_start_idx/audio_token_len/audio_lens must be provided if audio_values are provided."
             assert (
-                len(audio_token_start_idx)
-
-
-
+                len(audio_token_start_idx)
+                == len(audio_token_len)
+                == len(audio_lens)
+                == len(audio_values)
+            ), "audio_token_start_idx/audio_token_len/audio_lens/audio_values must have the same batch size."
+            assert len(audio_batch_size) == len(
+                inputs_embeds
+            ), "audio_batch_size and inputs_embeds must have the same batch size."
+
+            # B x A/3200 x (D=max-audio-length-in-batch)
             audio_tower_output = self.audio_tower.forward(
-                audio_values.to(self.audio_tower.dtype)
+                audio_values.to(self.audio_tower.dtype),
+                audio_len=audio_lens,
             ).last_hidden_state
             audio_tower_output = audio_tower_output.to(inputs_embeds.dtype)
-
             audio_embeds = self.multi_modal_projector.forward(audio_tower_output)
 
             # combine audio and text embeddings
-            for
-
-
-
-            inputs_embeds[
+            for i_b, i_a in self._audio_iter(audio_batch_size):
+                start_idx = audio_token_start_idx[i_a]
+                token_len = audio_token_len[i_a]
+                item_embedding = audio_embeds[i_a][:token_len]
+                inputs_embeds[i_b][start_idx : start_idx + token_len] = item_embedding
 
         lm_output = self.language_model.forward(
             inputs_embeds=inputs_embeds,

@@ -242,6 +270,8 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         audio_values: Optional[torch.FloatTensor] = None,
         audio_token_start_idx: Optional[torch.Tensor] = None,
         audio_token_len: Optional[torch.Tensor] = None,
+        audio_lens: Optional[torch.Tensor] = None,
+        audio_batch_size: Optional[torch.Tensor] = None,
         past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
         attention_mask: Optional[torch.Tensor] = None,
         inputs_embeds: Optional[torch.Tensor] = None,

@@ -270,6 +300,8 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
                 audio_token_start_idx - prefill_start_idx
             )
            model_input["audio_token_len"] = audio_token_len
+            model_input["audio_batch_size"] = audio_batch_size
+            model_input["audio_lens"] = audio_lens
 
         return model_input
 

@@ -286,18 +318,32 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         cls, config: UltravoxConfig
     ) -> Union[transformers.Wav2Vec2Model, "ModifiedWhisperEncoder"]:
         if config.audio_model_id is not None:
-            if "whisper" in config.audio_model_id
+            if "whisper" in config.audio_model_id.lower():
                 audio_tower = ModifiedWhisperEncoder.from_pretrained(
                     config.audio_model_id, torch_dtype=config.torch_dtype
                 )
+                audio_tower.init_latency_mask(
+                    config.audio_latency_block_size, dtype=config.torch_dtype
+                )
             else:
+                assert config.audio_latency_block_size in (
+                    None,
+                    0,
+                ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'"
                 audio_tower = transformers.AutoModel.from_pretrained(
                     config.audio_model_id, torch_dtype=config.torch_dtype
                 )
         else:
-            if "whisper" in config.audio_config._name_or_path:
+            if "whisper" in config.audio_config._name_or_path.lower():
                 audio_tower = ModifiedWhisperEncoder(config.audio_config)
+                audio_tower.init_latency_mask(
+                    config.audio_latency_block_size, dtype=config.torch_dtype
+                )
             else:
+                assert config.audio_latency_block_size in (
+                    None,
+                    0,
+                ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'"
                 with transformers.modeling_utils.no_init_weights():
                     # we only ever use from_config if the weights are retrained, hence initializing is not
                     # required. This makes the model quite creation faster since init on CPU is quite slow.

@@ -375,21 +421,32 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
             self.merge_and_unload()
         return super().push_to_hub(*args, **kwargs)
 
-    def
-        self,
-    ):
+    def diff_state_dict(
+        self, state_dict: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
         if state_dict is None:
             state_dict = super().state_dict()
 
-
+        trainable_params = {k for k, v in self.named_parameters() if v.requires_grad}
+        # normalize the keys to match the original model
+        # Example: audio_tower.base_model.model.layers.0._fsdp_wrapped_module.self_attn.k_proj.lora_B.default.weight
+        trainable_params = {
+            k.replace("_fsdp_wrapped_module.", "") for k in trainable_params
+        }
 
         state_dict = {
             k: v
             for k, v in state_dict.items()
-            if k in self.keep_params
-            or (k in named_params and named_params[k].requires_grad)
+            if k in self.keep_params or k in trainable_params
         }
 
+        return state_dict
+
+    def save_pretrained(
+        self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs
+    ):
+        state_dict = self.diff_state_dict(state_dict)
+
         super().save_pretrained(*args, state_dict=state_dict, **kwargs)
 
     def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs):

@@ -426,7 +483,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
 
 # TODO: refactor common parts to a shared module
 def is_cache_empty(
-    past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]]
+    past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]],
 ) -> bool:
     """
     Check if the cache is empty.

@@ -462,12 +519,8 @@ def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module:
 
 class StackAudioFrames(nn.Module):
     """
-    Stack the audio embedding frames to reduce the sequence length by a factor
-
-    The number of output frames will be `ceil(T / stack_factor) + 1` where `T` is the number of input frames.
-    NOTE: the extra +1 is intentional: in case the number of audio tokens are over-estimated by the processor,
-    we want to make sure `processor.audio_token_replacement` (i.e. EOS) doesn't get leaked into the middle of embeddings.
-    In most cases this extra padding will get removed in the model's forward function so it has no effect.
+    Stack the audio embedding frames to reduce the sequence length by a factor
+    of `stack_factor`.
     """
 
     def __init__(self, stack_factor: int = 8):

@@ -477,7 +530,7 @@ class StackAudioFrames(nn.Module):
     def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor:
         B, T, C = audio_embeds.shape
         T_pad = (T + self.stack_factor - 1) // self.stack_factor * self.stack_factor
-        audio_embeds = F.pad(audio_embeds, (0, 0, 0, T_pad - T
+        audio_embeds = F.pad(audio_embeds, (0, 0, 0, T_pad - T))
         B, T, C = audio_embeds.shape
         audio_embeds = audio_embeds.view(
             B, T // self.stack_factor, C * self.stack_factor

@@ -497,31 +550,43 @@ class SwiGLU(nn.Module):
         return F.silu(gate) * x
 
 
-class UltravoxProjector(nn.
+class UltravoxProjector(nn.Module):
     def __init__(self, config: UltravoxConfig):
         super().__init__()
         self.hidden_dim = config.hidden_size
         self._pad_and_stack = StackAudioFrames(config.stack_factor)
-
-        self.ln_pre = RMSNorm(
-        self.linear_1 = nn.Linear(
-
+        dim_in = config.audio_config.hidden_size * config.stack_factor
+        self.ln_pre = RMSNorm(dim_in, init=config.norm_init)
+        self.linear_1 = nn.Linear(dim_in, self.hidden_dim, bias=False)
+        dim_mid = self.hidden_dim
         self.act = transformers.activations.get_activation(config.projector_act)
-
-
-        self.
+        dim_mid = dim_mid // 2 if config.projector_act == "swiglu" else dim_mid
+        dim_out = config.text_config.hidden_size
+        self.linear_2 = nn.Linear(dim_mid, dim_out, bias=False)
+
+        # Ultravox v0.4.1 and below uses layer_norm after the second linear layer,
+        # while v0.5.0 and above uses layer_norm after the first linear layer.
+        if config.projector_ln_mid:
+            self.ln_mid: nn.Module = RMSNorm(dim_mid, init=config.norm_init)
+            self.ln_post: nn.Module = nn.Identity()
+        else:
+            self.ln_mid = nn.Identity()
+            self.ln_post = RMSNorm(dim_out, init=config.norm_init)
 
     def forward(self, audio_features: torch.Tensor) -> torch.Tensor:
         audio_features = self._pad_and_stack(audio_features)
         audio_features = self.ln_pre(audio_features)
         hidden_states = self.linear_1(audio_features)
         hidden_states = self.act(hidden_states)
+        hidden_states = self.ln_mid(hidden_states)
         hidden_states = self.linear_2(hidden_states)
         hidden_states = self.ln_post(hidden_states)
         return hidden_states
 
 
-class ModifiedWhisperEncoder(
+class ModifiedWhisperEncoder(
+    whisper.WhisperEncoder, transformers.modeling_utils.ModuleUtilsMixin
+):
     """
     Encoder portion of OpenAI's Whisper model.
 

@@ -537,20 +602,57 @@ class ModifiedWhisperEncoder(whisper.WhisperEncoder):
     base_model_prefix = "model.encoder"
     _no_split_modules = ["WhisperEncoderLayer"]
 
+    def __init__(self, config: transformers.WhisperConfig):
+        super().__init__(config)
+        self.config.is_decoder = False
+
+    @property
+    def max_context_length(self):
+        return (
+            self.config.max_source_positions
+            * self.conv1.stride[0]
+            * self.conv2.stride[0]
+        )
+
+    def init_latency_mask(self, audio_latency_block_size: int, dtype: torch.dtype):
+        if audio_latency_block_size is None:
+            self.audio_streaming_mask = None
+            return
+
+        # Use max_context_length directly in the calculation
+        max_seqlen = self.max_context_length
+        assert (
+            max_seqlen > 0
+        ), f"maximum sequence length must be positive, got {max_seqlen}"
+        assert (
+            max_seqlen % audio_latency_block_size == 0
+        ), f"audio_latency_block_size {audio_latency_block_size} must divide {max_seqlen} evenly."
+        # Given the block size, we calculate number of blocks.
+        audio_latency_nblocks = max_seqlen // audio_latency_block_size
+        audio_streaming_mask = (
+            torch.tril(
+                torch.ones(audio_latency_nblocks, audio_latency_nblocks),
+                diagonal=0,
+            )
+            .repeat_interleave(audio_latency_block_size, dim=0)
+            .repeat_interleave(audio_latency_block_size, dim=1)
+        )
+        audio_streaming_mask = (1.0 - audio_streaming_mask) * torch.finfo(dtype).min
+        audio_streaming_mask = audio_streaming_mask[None, None, :, :]
+        self.register_buffer(
+            "audio_streaming_mask", audio_streaming_mask, persistent=False
+        )
+
     def forward(
         self,
         input_features,
-
+        audio_len=None,
         head_mask=None,
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
     ):
-        expected_seq_length =
-            self.config.max_source_positions
-            * self.conv1.stride[0]
-            * self.conv2.stride[0]
-        )
+        expected_seq_length = self.max_context_length
         if input_features.shape[-1] > expected_seq_length:
             raise ValueError(
                 f"Whisper expects the mel input features to be of length {expected_seq_length} or less, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."

@@ -583,6 +685,37 @@ class ModifiedWhisperEncoder(whisper.WhisperEncoder):
         encoder_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
 
+        # Create attention mask based on audio lengths to mask out padding tokens
+        # For each sample in batch:
+        # - Convert raw audio length to feature length after convolutions
+        # - Create boolean mask that is True for valid positions and False for padding
+        # - Convert to extended attention mask format expected by transformer layers
+        #   (1.0 for positions to attend to, large negative for positions to ignore)
+        # This masking ensures consistent behavior between training and inference
+        # by preventing the model from attending to padding tokens in both cases
+        attention_mask = None
+        if audio_len != None:
+            audio_feature_len = self._get_feat_extract_output_lengths(audio_len)
+            max_seq_len = hidden_states.shape[1]
+            attention_mask = torch.arange(max_seq_len, device=hidden_states.device)[
+                None, :
+            ].lt(audio_feature_len.view(-1, 1))
+            attention_mask = self.get_extended_attention_mask(
+                attention_mask,
+                None,
+                dtype=hidden_states.dtype,
+            )
+
+        if self.audio_streaming_mask is not None:
+            seqlen = hidden_states.size(-2)
+            if attention_mask is not None:
+                attention_mask = torch.minimum(
+                    self.audio_streaming_mask[:, :, :seqlen, :seqlen], attention_mask
+                )  # merge
+            else:
+                attention_mask = self.audio_streaming_mask[:, :, :seqlen, :seqlen]
+            attention_mask = attention_mask.to(hidden_states.dtype)
+
         # check if head_mask has a correct number of layers specified if desired
         if head_mask is not None:
             assert head_mask.size()[0] == (

@@ -606,14 +739,14 @@ class ModifiedWhisperEncoder(whisper.WhisperEncoder):
                 layer_outputs = self._gradient_checkpointing_func(
                     encoder_layer.__call__,
                     hidden_states,
-
+                    attention_mask,
                     (head_mask[idx] if head_mask is not None else None),
                     output_attentions,
                 )
             else:
                 layer_outputs = encoder_layer(
                     hidden_states,
-
+                    attention_mask,
                     layer_head_mask=(
                         head_mask[idx] if head_mask is not None else None
                     ),
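To make the streaming mask added above concrete: `init_latency_mask` builds a block-causal attention mask with `torch.tril` over blocks and `repeat_interleave` to expand each block to `audio_latency_block_size` frames. A standalone toy-sized sketch of the same construction (sizes here are illustrative, not the real 3000-frame Whisper context):

```python
import torch

# Toy version of ModifiedWhisperEncoder.init_latency_mask: each frame may attend to
# frames in its own latency block and earlier blocks, but not to later blocks.
max_seqlen = 8
audio_latency_block_size = 2
nblocks = max_seqlen // audio_latency_block_size  # 4 blocks of 2 frames each

block_causal = (
    torch.tril(torch.ones(nblocks, nblocks), diagonal=0)
    .repeat_interleave(audio_latency_block_size, dim=0)
    .repeat_interleave(audio_latency_block_size, dim=1)
)
# Additive form used by the encoder: 0 where attention is allowed,
# the dtype's minimum where it is blocked.
additive = (1.0 - block_causal) * torch.finfo(torch.float32).min
print(block_causal.int())
```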
ultravox_processing.py
CHANGED

@@ -1,12 +1,69 @@
-
+import dataclasses
+from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
 import torch
+import torch.nn.functional as F
 import transformers
 
 from .ultravox_config import UltravoxConfig
 
 
+@dataclasses.dataclass
+class DataCollatorForSeq2SeqWithAudio(transformers.DataCollatorForSeq2Seq):
+    # when enabled, the alt_input_ids, alt_attention_mask, and alt_labels fields are used for computing the KL loss in UltravoxModel
+    include_alt_fields: bool = False
+
+    def __call__(self, features, *args, **kwargs):
+        audio_values = [x for f in features for x in f.pop("audio_values", [])]
+        audio_lens = [x for f in features for x in f.pop("audio_lens", [])]
+        audio_token_len = [x for f in features for x in f.pop("audio_token_len", [])]
+        audio_token_start_idx = [
+            x for f in features for x in f.pop("audio_token_start_idx", [])
+        ]
+
+        if self.include_alt_fields:
+            # these fields are hard-coded in the transformer data collator, so they need special handling before calling the super method
+            alt_features = [
+                {
+                    "input_ids": f.pop("alt_input_ids"),
+                    "attention_mask": f.pop("alt_attention_mask"),
+                    "labels": f.pop("alt_labels"),
+                }
+                for f in features
+            ]
+
+        batch = super().__call__(features, *args, **kwargs)
+        if self.include_alt_fields:
+            alt_batch = super().__call__(alt_features, *args, **kwargs)
+            batch["alt_input_ids"] = alt_batch["input_ids"]
+            batch["alt_attention_mask"] = alt_batch["attention_mask"]
+            batch["alt_labels"] = alt_batch["labels"]
+
+        batch["audio_token_start_idx"] = torch.stack(audio_token_start_idx)
+        batch["audio_lens"] = torch.stack(audio_lens)
+        batch["audio_token_len"] = torch.stack(audio_token_len)
+
+        # Pad the last dimension of all audio_values to the same length, with 0s on the right.
+        if audio_values:
+            max_len = max([x.shape[-1] for x in audio_values])
+            batch["audio_values"] = torch.stack(
+                [F.pad(x, (0, max_len - x.shape[-1])) for x in audio_values]
+            )
+            if self.tokenizer.padding_side == "left":
+                input_ids_lens = torch.LongTensor(
+                    [f["input_ids"].shape[-1] for f in features]
+                )
+                displacement = batch["input_ids"].shape[-1] - input_ids_lens
+                displacement = displacement.repeat_interleave(
+                    batch["audio_batch_size"].squeeze(-1)
+                )
+                batch["audio_token_start_idx"] += displacement.to(
+                    batch["audio_token_start_idx"].device
+                )
+        return batch
+
+
 class UltravoxProcessor(transformers.ProcessorMixin):
     """
     Constructs an Ultravox processor which wraps an audio processor and a tokenizer into a single processor.

@@ -17,11 +74,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
     """
 
     attributes = ["audio_processor", "tokenizer"]
-    audio_processor_class = (
-        "Wav2Vec2Processor",
-        "SeamlessM4TFeatureExtractor",
-        "WhisperProcessor",
-    )
+    audio_processor_class = ("WhisperProcessor",)
     tokenizer_class = (
         "PreTrainedTokenizer",
         "PreTrainedTokenizerFast",

@@ -35,41 +88,46 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         audio_processor=None,
         tokenizer=None,
         audio_padding: str = "longest",
-        encoder_ds_factor: int =
+        encoder_ds_factor: int = 2,
         stack_factor: int = 8,
         audio_placeholder: str = "<|audio|>",
+        # Defaults to whisper encoder context size
+        audio_context_size: Optional[int] = 3000,
     ):
         """
         Args:
             audio_processor: The audio processor for the audio encoder.
             tokenizer: The tokenizer for the language model.
            audio_padding: The padding strategy for the audio encoder.
-            encoder_ds_factor: The downsample factor of the audio encoder.
            stack_factor: The factor by which the audio encoder output is stacked in the multimodal projector.
+            encoder_ds_factor: The downsampling factor of the audio encoder.
            audio_placeholder: The placeholder for the audio in the text.
+            audio_context_size: The maximum number of frames that the audio encoder can handle.
        """
        self.audio_padding = audio_padding
        self.encoder_ds_factor = encoder_ds_factor
        self.stack_factor = stack_factor
        self.audio_placeholder = audio_placeholder
-        self.
+        self.audio_context_size = audio_context_size
        assert (
-
+            tokenizer.eos_token is not None
        ), "The tokenizer has no EOS token. Cannot recover."
+        self.vocab = tokenizer.get_vocab()
+        self.audio_token_replacement = tokenizer.eos_token
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id
 
        super().__init__(audio_processor=audio_processor, tokenizer=tokenizer)
 
    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
        config: UltravoxConfig = transformers.AutoConfig.from_pretrained(
            pretrained_model_name_or_path, **kwargs
        )
        audio_processor = transformers.AutoProcessor.from_pretrained(
            config.audio_model_id
            or config.audio_config._name_or_path
-            or "
+            or "openai/whisper-tiny"
        )
 
        tokenizer = transformers.AutoTokenizer.from_pretrained(

@@ -84,30 +142,100 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             stack_factor=config.stack_factor,
         )
 
+    def _chunk_and_pad_audio(
+        self,
+        audio_values: torch.Tensor,
+        audio_lens: torch.Tensor,
+        include_audio_num_chunks: bool = False,
+    ) -> Dict[str, Any]:
+        """
+        Processes the audio batch by chunking any items in the batch according to the audio_context_size,
+        padding the last chunk if needed, and returns a dictionary with updated audio data.
+
+        Args:
+            audio_values (torch.Tensor): A tensor of audio values (e.g., in B, D, T format).
+            audio_lens (torch.Tensor): A tensor of audio lengths.
+
+        Returns:
+            Dict[str, Any]: Dictionary with the following keys:
+                - "audio_values": The concatenated audio tensor after chunking and padding.
+                - "audio_lens": Tensor of lengths for each chunk.
+                - "audio_is_continuation": Tensor of booleans indicating if the chunk is a continuation of the previous chunk.
+                - "audio_batch_size": A Tensor with one integer representing the number of chunks.
+
+        """
+        chunked_audio_values: List[torch.Tensor] = []
+        chunked_audio_lens: List[int] = []
+        is_continuation_list: List[bool] = []
+        num_chunks: List[int] = []
+        context_size = self.audio_context_size or audio_values.shape[-1]
+
+        for i in range(audio_values.shape[0]):  # iterate over the batch
+            num_chunks.append(int(np.ceil(audio_lens[i] / context_size)))
+            for offset in range(0, audio_lens[i], context_size):
+                is_continuation = offset > 0
+                chunk = audio_values[i, :, offset : offset + context_size]
+                if is_continuation and chunk.shape[-1] < context_size:
+                    # N.B. We only need to pad continuation chunks. If none of the samples require chunking, the
+                    # batch might not (need to) be padded all the way to the audio_context_size, in which case
+                    # we've already included the padding above. On the other hand, if we have any continuation
+                    # chunks we know that the batch needs to be padded to audio_context_size because that's what
+                    # we're slicing to.
+                    chunk = F.pad(chunk, (0, context_size - chunk.shape[-1]))
+                chunked_audio_values.append(chunk)
+                chunked_audio_lens.append(
+                    min(int(audio_lens[i].item()) - offset, context_size)
+                )
+                is_continuation_list.append(is_continuation)
+
+        data = {
+            "audio_values": torch.stack(chunked_audio_values, dim=0),
+            "audio_lens": torch.tensor(
+                chunked_audio_lens, dtype=torch.int64, device=audio_values.device
+            ),
+            "audio_is_continuation": torch.tensor(
+                is_continuation_list, dtype=torch.bool, device=audio_values.device
+            ),
+            "audio_batch_size": torch.tensor(
+                [len(chunked_audio_values)], device=audio_values.device
+            ),
+        }
+        if include_audio_num_chunks:
+            data["audio_num_chunks"] = torch.tensor(
+                num_chunks, dtype=torch.int64, device=audio_values.device
+            )
+        return data
+
     def __call__(
         self,
         text: Optional[str] = None,
         audio: Optional[Union[np.ndarray, torch.Tensor]] = None,
+        audios: Optional[
+            Union[
+                List[Union[np.ndarray, torch.Tensor]], Union[np.ndarray, torch.Tensor]
+            ]
+        ] = None,
         sampling_rate: Optional[int] = None,
         return_tensors: Optional[
             Union[str, transformers.TensorType]
         ] = transformers.TensorType.PYTORCH,
+        include_audio_num_chunks: bool = False,
         **kwargs,
     ) -> transformers.BatchFeature:
         """
         Main method to prepare for the model one text sequence and audio. This method forwards the `text`
         and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
         the text. To prepare the audio(s), this method forwards the `audio`, `sampling_rate` and `kwargs` arguments to
-        audio processor's [`~
+        audio processor's [`~WhisperProcessor.__call__`] if `audio` is not `None`. Please refer to the docstring
         of the above two methods for more information.
 
         Args:
             text (`str`, `List[str]`):
                 The sequence to be encoded. Sequence can be a string or (pretokenized string).
             audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
-                The audio to be prepared. Audio can be NumPy array or PyTorch tensor.
-
-
+                The audio to be prepared. Audio can be a single-channel (1-dimensional) NumPy array or PyTorch tensor.
+            audios (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                A list or two dimensional array of audio to be prepared.
             sampling_rate (`int`, *optional*, defaults to 16000):
                 Sampling rate of the input audio. We expect 16kHz audio. Don't change this value unless you know what
                 you are doing.

@@ -131,64 +259,105 @@ class UltravoxProcessor(transformers.ProcessorMixin):
                 Returned when `audio` is not `None`.
             - **audio_token_start_idx** -- The index in the tokenized text where the audio starts. Returned when `audio` is not `None`.
         """
-        # TODO: Add support for multiple
+        # TODO: Add support for multiple text inputs.
+        if audio is not None and audios is not None:
+            raise ValueError("Only one of `audio` or `audios` should be provided.")
+        elif audio is not None:
+            audios = audio if isinstance(audio, list) or audio.ndim == 2 else [audio]
+        elif audios is None:
+            audios = []
+
         data = {}
-
-        if
-        if
-
-
-
-
-
-
-
-
-
-
-
+        audio_is_continuation = []
+        if len(audios) > 0:
+            audios = [x.numpy() if isinstance(x, torch.Tensor) else x for x in audios]
+
+            # Pad out each audio to at least 2 hops (the minimum required by the processor).
+            hop_length = self.audio_processor.feature_extractor.hop_length
+            audios = [
+                (
+                    np.pad(x, (0, 2 * hop_length - len(x)), mode="constant")
+                    if len(x) < 2 * hop_length
+                    else x
+                )
+                for x in audios
+            ]
 
             # Main audio processing. The processor is model-specific.
-            x = self.audio_processor(
-
+            x: transformers.BatchFeature = self.audio_processor(
+                audios,
                 sampling_rate=sampling_rate,
                 padding="longest",
-
+                pad_to_multiple_of=hop_length,  # The attention mask effectively gets padded to the hop length, so pad the audio to be consistent.
+                truncation=False,
+                return_attention_mask=True,
                 **kwargs,
             )
-            if "input_features" in x:
-                data["audio_values"] = x.input_features
-            else:
-                data["audio_values"] = x.input_values
 
-
-
-
-
-
-
-
-                    f"audio must be provided when using audio placeholder ({self.audio_placeholder}) in text."
-                )
-
-            start_idx = len(
-                self.tokenizer.encode(
-                    text[: text.index(self.audio_placeholder)],
-                    add_special_tokens=False,
-                )
-            )
-            data["audio_token_start_idx"] = [start_idx]
-
-            # Replace the audio placeholder with the audio token.
-            # e.g. "Transcribe\n<|audio|>" -> "Transcribe </s></s></s></s></s></s></s></s>"
-            # where the number of </s> is the number of audio frames.
-            text = text.replace(
-                self.audio_placeholder,
-                self.audio_token_replacement * audio_embed_frames,
-            )
+            data.update(
+                self._chunk_and_pad_audio(
+                    audio_values=torch.as_tensor(
+                        x.input_features if "input_features" in x else x.input_values
+                    ),
+                    audio_lens=torch.as_tensor(x.attention_mask).sum(-1),
+                    include_audio_num_chunks=include_audio_num_chunks,
+                )
+            )
+
+            audio_is_continuation = data.pop("audio_is_continuation")
+            data["audio_token_len"] = torch.ceil(
+                data["audio_lens"] / (self.encoder_ds_factor * self.stack_factor)
+            ).to(dtype=torch.int)
+
+        if text is not None:
+            if not isinstance(text, str):
+                raise ValueError("Text must be a string. Batch mode not supported yet.")
 
             # Special tokens like BOS should already have been added by the caller.
-
+            tokenized_parts = self.tokenizer(
+                text.split(
+                    "<|audio|>"  # The placeholder isn't part of the vocabulary, so split the text around it.
+                ),
+                add_special_tokens=False,
+                **kwargs,
+            )
+
+            audio_token_start_idx = []
+            placeholder_index = -1
+            split_input_ids = tokenized_parts["input_ids"]
+            input_ids: List[int] = []
+
+            audio_token_replacement_token_id = self.vocab[self.audio_token_replacement]
+
+            for i, token_len in enumerate(data.get("audio_token_len", [])):
+                if not audio_is_continuation[i]:
+                    placeholder_index += 1
+                    if placeholder_index >= len(split_input_ids):
+                        raise ValueError(
+                            f"Text contains too few audio placeholders. (Expected {len(audios)} placeholders)"
+                        )
+
+                    input_ids.extend(split_input_ids[placeholder_index])
+
+                audio_token_start_idx.append(len(input_ids))
+
+                input_ids.extend([audio_token_replacement_token_id] * token_len)
+
+            # Include any tokens after the last audio.
+            placeholder_index += 1
+            if placeholder_index != len(split_input_ids) - 1:
+                raise ValueError(
+                    f"Text contains too many audio placeholders. (Expected {len(audios)} placeholders)"
+                )
+            input_ids.extend(split_input_ids[placeholder_index])
+
+            if "audio_token_len" in data:
+                data["audio_token_start_idx"] = torch.as_tensor(audio_token_start_idx)
+
+            data["input_ids"] = [input_ids]
+            data["attention_mask"] = [[1] * len(input_ids)]
+
+            # Ensure that there are no audio placeholders after the last audio.
 
         return transformers.BatchFeature(data=data, tensor_type=return_tensors)
 
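For orientation, a sketch of how the updated processor is typically driven end to end. The checkpoint name is a placeholder (this diff does not pin one) and the prompt is illustrative; the only hard requirement is that the text contains one `<|audio|>` placeholder per audio clip:

```python
import numpy as np
import transformers

# Placeholder checkpoint name; processor_config.json maps "AutoProcessor" to
# ultravox_processing.UltravoxProcessor, so trust_remote_code is required.
processor = transformers.AutoProcessor.from_pretrained(
    "path/or/hub-id-of-an-ultravox-checkpoint", trust_remote_code=True
)

audio = np.zeros(16_000, dtype=np.float32)  # 1 second of silence at 16 kHz
inputs = processor(
    text="Transcribe\n<|audio|>",
    audio=audio,
    sampling_rate=16_000,
)
# The placeholder is expanded to `audio_token_len` EOS tokens, and
# `audio_token_start_idx` records where they begin inside `input_ids`.
print(inputs["audio_token_len"], inputs["audio_token_start_idx"])
```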