Upload files with `vila-upload`.

Browse files

Upload processing_vila.py
Upload modeling_vila.py

Files changed (2) hide show

modeling_vila.py +39 -35
processing_vila.py +267 -207

modeling_vila.py CHANGED Viewed

@@ -1,9 +1,10 @@
-from typing import List, Optional, Type
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch import Tensor
 from transformers.configuration_utils import PretrainedConfig
 from transformers.generation.utils import GenerationMixin
 from transformers.modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast
@@ -55,23 +56,22 @@ class MultimodalProjector(nn.Module):
     ):
         super().__init__(*args, **kwargs)
-        match config.mm_projector_type:
-            case "mlp_downsample_3x3_fix":
-                self.layers = nn.Sequential(
-                    DownSample3x3BlockFix(),
-                    nn.LayerNorm(config.mm_hidden_size * 9),
-                    nn.Linear(
-                        config.mm_hidden_size * 9,
-                        config.mm_hidden_size * 3,
-                    ),
-                    nn.GELU(),
-                    nn.LayerNorm(config.vision_config.hidden_size * 3),
-                    nn.Linear(config.vision_config.hidden_size * 3, config.hidden_size),
-                    nn.GELU(),
-                    nn.Linear(config.hidden_size, config.hidden_size),
-                )
-            case _:
-                raise NotImplementedError(f"Unsupported mm_projector_type: {config.mm_projector_type}")
         self.layers.type(config.torch_dtype)
@@ -131,22 +131,29 @@ class VILAForConditionalGeneration(PreTrainedModel, GenerationMixin):
         attention_mask: Optional[Tensor] = None,
         input_ids: Optional[Tensor] = None,
         inputs_embeds: Optional[Tensor] = None,
         pixel_values: Optional[Tensor] = None,
         **kwargs,
     ) -> CausalLMOutputWithPast:
-        # Vision info is only used for prefilling.
-        if kwargs.get("past_key_values", None) is not None:
-            pixel_values = None
-        if inputs_embeds is None:
-            if input_ids is None:
-                raise ValueError("input_ids is required when inputs_embeds is None")
-            inputs_embeds = self._embed(input_ids, pixel_values)
         outputs = self.llm.__call__(
-            inputs_embeds=inputs_embeds.to(device=self.llm.device, dtype=self.llm.dtype),
             attention_mask=(attention_mask.to(device=self.llm.device) if attention_mask is not None else None),
             **kwargs,
         )
@@ -208,10 +215,7 @@ class VILAForConditionalGeneration(PreTrainedModel, GenerationMixin):
         selected_layer_hidden_states = vision_tower_output.hidden_states[self.config.mm_vision_select_layer]
-        match self.config.mm_vision_select_feature:
-            case "cls_patch":
-                return selected_layer_hidden_states
-            case _:
-                raise NotImplementedError(
-                    f"Unsupported mm_vision_select_feature: {self.config.mm_vision_select_feature}"
-                )

+from typing import List, Optional, Type, Union
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch import LongTensor, Tensor
+from transformers.cache_utils import Cache
 from transformers.configuration_utils import PretrainedConfig
 from transformers.generation.utils import GenerationMixin
 from transformers.modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast
     ):
         super().__init__(*args, **kwargs)
+        if config.mm_projector_type == "mlp_downsample_3x3_fix":
+            self.layers = nn.Sequential(
+                DownSample3x3BlockFix(),
+                nn.LayerNorm(config.mm_hidden_size * 9),
+                nn.Linear(
+                    config.mm_hidden_size * 9,
+                    config.mm_hidden_size * 3,
+                ),
+                nn.GELU(),
+                nn.LayerNorm(config.vision_config.hidden_size * 3),
+                nn.Linear(config.vision_config.hidden_size * 3, config.hidden_size),
+                nn.GELU(),
+                nn.Linear(config.hidden_size, config.hidden_size),
+            )
+        else:
+            raise NotImplementedError(f"Unsupported mm_projector_type: {config.mm_projector_type}")
         self.layers.type(config.torch_dtype)
         attention_mask: Optional[Tensor] = None,
         input_ids: Optional[Tensor] = None,
         inputs_embeds: Optional[Tensor] = None,
+        past_key_values: Optional[Cache] = None,
         pixel_values: Optional[Tensor] = None,
+        position_ids: Optional[LongTensor] = None,
+        logits_to_keep: Union[int, Tensor] = 0,
         **kwargs,
     ) -> CausalLMOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds.")
+        if past_key_values is None:  # Prefill
+            if input_ids is not None:
+                inputs_embeds = self._embed(input_ids, pixel_values)
+                input_ids = None
         outputs = self.llm.__call__(
             attention_mask=(attention_mask.to(device=self.llm.device) if attention_mask is not None else None),
+            input_ids=(input_ids.to(device=self.llm.device) if input_ids is not None else None),
+            inputs_embeds=(
+                inputs_embeds.to(device=self.llm.device, dtype=self.llm.dtype) if inputs_embeds is not None else None
+            ),
+            past_key_values=past_key_values,
+            position_ids=(position_ids.to(device=self.llm.device) if position_ids is not None else None),
+            logits_to_keep=logits_to_keep,
             **kwargs,
         )
         selected_layer_hidden_states = vision_tower_output.hidden_states[self.config.mm_vision_select_layer]
+        if self.config.mm_vision_select_feature == "cls_patch":
+            return selected_layer_hidden_states
+        else:
+            raise NotImplementedError(f"Unsupported mm_vision_select_feature: {self.config.mm_vision_select_feature}")

processing_vila.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from typing import List, Optional, Tuple, cast
 import transformers.image_transforms as image_transforms
@@ -14,7 +15,7 @@ from transformers.models.siglip.image_processing_siglip import SiglipImageProces
 from transformers.models.siglip.image_processing_siglip_fast import SiglipImageProcessorFast
 from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase, TextInput
 from transformers.video_utils import VideoInput
 logger = transformers.utils.logging.get_logger(__name__)
@@ -83,7 +84,6 @@ class VILAProcessor(ProcessorMixin):
         text: TextInput | List[TextInput],
         images: Optional[ImageInput] = None,
         videos: Optional[VideoInput] = None,
-        audio: None = None,
         **kwargs: Unpack[ProcessingKwargs],
     ) -> VILAProcessorOutput:
         """Preprocesses inputs for VILA.
@@ -92,7 +92,6 @@ class VILAProcessor(ProcessorMixin):
             text: The text to be processed.
             images: The images to be processed.
             videos: The videos to be processed.
-            audio: Not available.
             **kwargs: Additional arguments for processing.
         Returns:
@@ -105,58 +104,32 @@ class VILAProcessor(ProcessorMixin):
             **kwargs,
         )
-        prepared_text, prepared_images, prepared_videos = self._prepare_inputs(
             text=text,
             images=images,
             videos=videos,
         )
-        # Process videos.
-        prepared_text, prepared_images, video_flags = self._treat_videos_as_image_seqs(
-            text=prepared_text,
-            images=prepared_images,
-            videos=prepared_videos,
         )
-        # Process images.
-        image_inputs, num_cropped_images = self._process_images(
-            images=prepared_images,
-            video_flags=video_flags,
-            **merged_kwargs["images_kwargs"],
-        )
-        # Process text.
-        prepared_text = self._pad_image_tokens_by_num_crops(
-            prepared_text,
-            num_cropped_images=num_cropped_images,
-            video_flags=video_flags,
-        )
-        prepared_text = self._pad_image_tokens_by_num_embeddings(prepared_text)
         text_inputs = self.tokenizer.__call__(
-            prepared_text,
             **merged_kwargs["text_kwargs"],
         )
-        # Find the last image token of each image tile and replace to "\n".
-        lf_token_id = self.tokenizer.encode("\n")[0]
-        image_token_id = self.tokenizer.image_token_id
-        for i in range(len(text_inputs.input_ids)):
-            input_ids = text_inputs.input_ids[i]
-            idx = 0
-            while idx < len(input_ids):
-                if input_ids[idx] != image_token_id:
-                    idx += 1
-                    continue
-                if idx + self.image_pad_len < len(input_ids):
-                    input_ids[idx + self.image_pad_len] = lf_token_id
-                    idx += self.image_pad_len + 1
-                else:
-                    break
         return VILAProcessorOutput(
             data={
@@ -165,119 +138,118 @@ class VILAProcessor(ProcessorMixin):
             }
         )
-    def _crop_image(
-        self,
-        image: Image,
-        *,
-        is_video_frame: bool,
-    ) -> List[Image]:
-        """Crops the image into multiple tiles.
         Args:
-            image: The image to be cropped.
-        Returns:
-            The cropped images.
-        """
-        # TODO: Support more image processors.
-        if not isinstance(self.image_processor, (SiglipImageProcessor, SiglipImageProcessorFast)):
-            raise NotImplementedError
-        assert self.image_processor.size["height"] == self.image_processor.size["width"]
-        cropped_size = self.image_processor.size["height"]
-        cropped_images: List[Image] = dynamic_preprocess(
-            image,
-            min_num=self.min_tiles,
-            max_num=self.max_tiles if not is_video_frame else self.video_max_tiles,
-            image_size=cropped_size,
-        )
-        return cropped_images
-    def _pad_image_tokens_by_num_crops(
-        self,
-        text: List[str],
-        *,
-        num_cropped_images: List[int],
-        video_flags: List[bool],
-    ) -> List[str]:
-        """Pads each \\<image> to num_cropped_images of "\\<image>\\n" for images and "\\<video>" for videos.
-        Args:
-            text: The text to be padded.
-            num_cropped_images: The number of cropped images for each image token.
-            video_flags: A list of flags indicating whether the num_cropped_images item is a video.
         Returns:
-            The padded text.
         """
-        assert len(num_cropped_images) == len(
-            video_flags
-        ), "num_cropped_images and video_flags must have the same length."
-        image_token: str = cast(str, self.tokenizer.image_token)
-        return_text: List[str] = []
         for text_item in text:
-            return_text_item: str = ""
-            # Repeatedly find image_token in the text.
-            while image_token in text_item:
                 image_pos = text_item.find(image_token)
-                if image_pos != -1 and len(num_cropped_images) > 0:
-                    num_crops = num_cropped_images.pop(0)
-                    video_flag = video_flags.pop(0)
-                    return_text_item += (
-                        text_item[:image_pos] + (image_token if video_flag else (image_token + "\n")) * num_crops
-                    )
-                    text_item = text_item[image_pos + len(image_token) :]
-                else:
                     break
-            # Must place outside the while loop.
-            if image_token in text_item:
-                raise ValueError("Too many image tokens in the text.")
-            return_text_item += text_item
-            text_item = ""
-            return_text.append(return_text_item)
-        if len(num_cropped_images) != 0:
-            raise ValueError("Too many images provided.")
-        return return_text
-    def _pad_image_tokens_by_num_embeddings(
         self,
-        text: List[str],
-    ) -> List[str]:
-        """Pads each \\<image> to image_pad_len times of "\\<image>".
         Args:
-            text: The text to be padded.
         Returns:
-            The padded text.
         """
         image_token = cast(str, self.tokenizer.image_token)
-        return [text_item.replace(image_token, image_token * (self.image_pad_len + 1)) for text_item in text]
-    @staticmethod
-    def _prepare_inputs(
         text: TextInput | List[TextInput],
         images: Optional[ImageInput],
         videos: Optional[VideoInput],
     ) -> Tuple[List[str], List[Image], List[List[Image]]]:
         prepared_text = text if isinstance(text, list) else [text]
         if images is not None:
@@ -296,117 +268,205 @@ class VILAProcessor(ProcessorMixin):
         return prepared_text, prepared_images, prepared_videos
-    def _process_images(
         self,
-        images: List[Image],
-        *,
-        video_flags: List[bool],
-        **kwargs,
-    ) -> Tuple[BatchFeature, List[int]]:
-        cropped_images: List[Image] = []
-        num_cropped_images: List[int] = []
-        for image, video_flag in zip(images, video_flags):
-            single_cropped_images = self._crop_image(image, is_video_frame=video_flag)
-            cropped_images.extend(single_cropped_images)
-            num_cropped_images.append(len(single_cropped_images))
-        if len(cropped_images) == 0:
-            # The image processor may not properly handle empty image lists.
-            # This is a workaround to avoid errors.
-            return BatchFeature(), num_cropped_images
-        image_inputs = self.image_processor.__call__(
-            cropped_images,
-            **kwargs,
-        )
-        return image_inputs, num_cropped_images
-    def _treat_videos_as_image_seqs(
-        self, text: List[str], images: List[Image], videos: List[List[Image]]
-    ) -> Tuple[List[str], List[Image], List[bool]]:
-        """Treats videos as image sequences.
-        This method will replace all video tokens in the text with #frame image tokens,
-        and insert the corresponding images into the images list.
         Args:
-            text: The text to be processed.
-            images: The images to be processed.
-            videos: The videos to be processed.
         Returns:
-            The processed text and images, and a list of flags indicating whether the images are from videos.
         """
         image_token = cast(str, self.tokenizer.image_token)
-        video_token = cast(str, self.tokenizer.video_token)
-        return_text: List[str] = []
-        return_images: List[Image] = []
-        return_video_flags: List[bool] = []
-        for text_item in text:
-            return_text_item: str = ""
-            # Repeatedly find image_token or video_token in the text.
-            while image_token in text_item or video_token in text_item:
-                image_pos = text_item.find(image_token)
-                video_pos = text_item.find(video_token)
-                # If not found, set position to the end of the text.
-                if image_pos == -1:
-                    image_pos = len(text_item)
-                if video_pos == -1:
-                    video_pos = len(text_item)
-                if image_pos != len(text_item) and len(images) > 0 and image_pos < video_pos:
-                    # Take an image and keep the image token if:
-                    #   - an image token is found, and
-                    #   - there are images left, and
-                    #   - the image token is before the first video token.
-                    image = images.pop(0)
-                    return_images.append(image)
-                    return_video_flags.append(False)
-                    return_text_item += text_item[: image_pos + len(image_token)]
-                    text_item = text_item[image_pos + len(image_token) :]
-                elif video_pos != len(text_item) and len(videos) > 0 and video_pos < image_pos:
-                    # Take a video and replace the video token with #frame image tokens if:
-                    #   - a video token is found, and
-                    #   - there are videos left, and
-                    #   - the video token is before the first image token.
-                    video = videos.pop(0)
-                    return_images.extend(video)
-                    return_video_flags.extend([True] * len(video))
-                    return_text_item += text_item[:video_pos] + image_token * len(video)
-                    text_item = text_item[video_pos + len(video_token) :]
-                else:
                     break
-            # Must place outside the while loop.
-            if image_token in text_item:
-                raise ValueError("Too many image tokens in the text.")
-            if video_token in text_item:
-                raise ValueError("Too many video tokens in the text.")
-            return_text_item += text_item
-            text_item = ""
-            return_text.append(return_text_item)
-        if len(images) != 0:
-            raise ValueError("Too many images provided.")
-        if len(videos) != 0:
-            raise ValueError("Too many videos provided.")
-        return return_text, return_images, return_video_flags
 def dynamic_preprocess(image: Image, min_num: int, max_num: int, image_size: int, use_thumbnail=True) -> List[Image]:

+import uuid
 from typing import List, Optional, Tuple, cast
 import transformers.image_transforms as image_transforms
 from transformers.models.siglip.image_processing_siglip_fast import SiglipImageProcessorFast
 from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TextInput
 from transformers.video_utils import VideoInput
 logger = transformers.utils.logging.get_logger(__name__)
         text: TextInput | List[TextInput],
         images: Optional[ImageInput] = None,
         videos: Optional[VideoInput] = None,
         **kwargs: Unpack[ProcessingKwargs],
     ) -> VILAProcessorOutput:
         """Preprocesses inputs for VILA.
             text: The text to be processed.
             images: The images to be processed.
             videos: The videos to be processed.
             **kwargs: Additional arguments for processing.
         Returns:
             **kwargs,
         )
+        normalized_text, normalized_images, normalized_videos = self._normalize_inputs(
             text=text,
             images=images,
             videos=videos,
         )
+        preprocessed_text, preprocessed_media_tiles = self._preprocess_inputs(
+            text=normalized_text,
+            images=normalized_images,
+            videos=normalized_videos,
         )
         text_inputs = self.tokenizer.__call__(
+            preprocessed_text,
             **merged_kwargs["text_kwargs"],
         )
+        if len(preprocessed_media_tiles) > 0:
+            image_inputs = self.image_processor.__call__(
+                preprocessed_media_tiles,
+                **merged_kwargs["images_kwargs"],
+            )
+        else:
+            image_inputs = BatchFeature()
+        text_inputs = self._replace_image_tile_suffix(text_inputs)
         return VILAProcessorOutput(
             data={
             }
         )
+    def _find_media_token_order(self, text: List[str]) -> List[str]:
+        """Finds the order of media tokens in the text.
         Args:
+            text: The text to be processed.
         Returns:
+            The order of media tokens in the text. Each item is either an image token or a video
+            token.
         """
+        image_token = cast(str, self.tokenizer.image_token)
+        video_token = cast(str, self.tokenizer.video_token)
+        return_order: List[str] = []
         for text_item in text:
+            while image_token in text_item or video_token in text_item:
                 image_pos = text_item.find(image_token)
+                video_pos = text_item.find(video_token)
+                if image_pos == -1 and video_pos == -1:
+                    # If no media token found, move to the next text item.
                     break
+                elif image_pos == -1:
+                    # If only video token found, add it to the return order.
+                    return_order.append(video_token)
+                    text_item = text_item[video_pos + len(video_token) :]
+                elif video_pos == -1:
+                    # If only image token found, add it to the return order.
+                    return_order.append(image_token)
+                    text_item = text_item[image_pos + len(image_token) :]
+                else:
+                    # If both tokens found, choose the one that appears first.
+                    if image_pos < video_pos:
+                        return_order.append(image_token)
+                        text_item = text_item[image_pos + len(image_token) :]
+                    else:
+                        return_order.append(video_token)
+                        text_item = text_item[video_pos + len(video_token) :]
+        return return_order
+    def _generate_image_token_placeholder(self, text: List[str]) -> str:
+        while True:
+            placeholder = f"<|image_placeholder_{str(uuid.uuid4())}|>"
+            if all(placeholder not in text_item for text_item in text):
+                return placeholder
+    def _merge_media_tiles(
         self,
+        image_tiles: List[List[Image]],
+        video_tiles: List[List[List[Image]]],
+        media_token_order: List[str],
+    ) -> List[Image]:
+        """Merges the media tiles by the media token order.
         Args:
+            image_tiles: The image tiles.
+            video_tiles: The video tiles.
+            media_token_order: The order of media tokens in the text.
         Returns:
+            The merged media tiles.
         """
         image_token = cast(str, self.tokenizer.image_token)
+        video_token = cast(str, self.tokenizer.video_token)
+        image_tiles_idx = 0
+        video_tiles_idx = 0
+        return_tiles: List[Image] = []
+        for media_token in media_token_order:
+            if media_token == image_token:
+                return_tiles.extend(image_tiles[image_tiles_idx])
+                image_tiles_idx += 1
+            elif media_token == video_token:
+                for video_tile in video_tiles[video_tiles_idx]:
+                    return_tiles.extend(video_tile)
+                video_tiles_idx += 1
+            else:
+                raise ValueError(f"Invalid media token: {media_token}")
+        return return_tiles
+    def _normalize_inputs(
+        self,
         text: TextInput | List[TextInput],
         images: Optional[ImageInput],
         videos: Optional[VideoInput],
     ) -> Tuple[List[str], List[Image], List[List[Image]]]:
+        """Normalizes text, image, and video inputs for processing.
+        This method converts various input formats into standardized lists of PIL images
+        and text strings that can be processed by the model.
+        Args:
+            text: The original input text.
+            images: The original input images.
+            videos: The original input videos.
+        Returns:
+            The text as a list of strings.
+            The images as a list of PIL images.
+            The videos as a list of lists of PIL images.
+        """
         prepared_text = text if isinstance(text, list) else [text]
         if images is not None:
         return prepared_text, prepared_images, prepared_videos
+    def _pad_image_tiles(
         self,
+        text: List[str],
+    ) -> List[str]:
+        """Pads each media tile.
+        This will pad each <image> to (self.image_pad_len + 1) times. The additional one padding is
+        for the \\n token suffix.
+        Args:
+            text: The text to be padded.
+        Returns:
+            The padded text.
+        """
+        image_token = cast(str, self.tokenizer.image_token)
+        return [text_item.replace(image_token, image_token * (self.image_pad_len + 1)) for text_item in text]
+    def _preprocess_inputs(
+        self,
+        text: List[str],
+        images: List[Image],
+        videos: List[List[Image]],
+    ) -> Tuple[List[str], List[Image]]:
+        """Preprocesses the input data for the VILA model.
+        This method takes a list of texts, images, and videos, and prepares them for the model.
+        It handles the interleaving of text and media, and returns the processed text and a
+        list of media tiles (images or video frames).
         Args:
+            text: The input text.
+            images: The input images.
+            videos: The input videos.
         Returns:
+            The text ready to be tokenized.
+            The media tiles ready to be processed.
         """
+        media_token_order = self._find_media_token_order(text)
+        image_token_placeholder = self._generate_image_token_placeholder(text)
+        preprocessed_text = text
+        preprocessed_text, preprocessed_image_tiles = self._preprocess_images(
+            preprocessed_text,
+            images,
+            image_token_placeholder=image_token_placeholder,
+        )
+        preprocessed_text, preprocessed_video_tiles = self._preprocess_videos(
+            preprocessed_text,
+            videos,
+            image_token_placeholder=image_token_placeholder,
+        )
+        # Convert back to the original image token.
         image_token = cast(str, self.tokenizer.image_token)
+        preprocessed_text = [text_item.replace(image_token_placeholder, image_token) for text_item in preprocessed_text]
+        preprocessed_text = self._pad_image_tiles(preprocessed_text)
+        preprocessed_media_tiles = self._merge_media_tiles(
+            preprocessed_image_tiles,
+            preprocessed_video_tiles,
+            media_token_order,
+        )
+        return preprocessed_text, preprocessed_media_tiles
+    def _preprocess_images(
+        self,
+        text: List[str],
+        images: List[Image],
+        *,
+        image_token_placeholder: str,
+    ) -> Tuple[List[str], List[List[Image]]]:
+        single_image_token_placeholder = self._generate_image_token_placeholder(text)
+        preprocessed_text = text
+        preprocessed_image_tiles: List[List[Image]] = []
+        for image in images:
+            preprocessed_text, preprocessed_single_image_tiles = self._preprocess_single_image(
+                text,
+                image,
+                image_token_placeholder=single_image_token_placeholder,
+                is_video_frame=False,
+                use_dynamic_preprocess=(len(images) == 1),
+            )
+            preprocessed_text = [
+                text_item.replace(
+                    single_image_token_placeholder,
+                    (image_token_placeholder + "\n") if len(images) == 1 else image_token_placeholder,
+                )
+                for text_item in preprocessed_text
+            ]
+            preprocessed_image_tiles.append(preprocessed_single_image_tiles)
+        return preprocessed_text, preprocessed_image_tiles
+    def _preprocess_single_image(
+        self,
+        text: List[str],
+        image: Image,
+        *,
+        image_token_placeholder: str,
+        is_video_frame: bool,
+        use_dynamic_preprocess: bool,
+    ) -> Tuple[List[str], List[Image]]:
+        assert isinstance(self.image_processor, (SiglipImageProcessor, SiglipImageProcessorFast))
+        assert self.image_processor.size["height"] == self.image_processor.size["width"]
+        cropped_size = self.image_processor.size["height"]
+        if use_dynamic_preprocess:
+            if is_video_frame:
+                max_num = self.video_max_tiles
+            else:
+                max_num = self.max_tiles
+        else:
+            max_num = 1
+        image = image.convert("RGB")
+        cropped_images: List[Image] = dynamic_preprocess(
+            image,
+            min_num=self.min_tiles,
+            max_num=max_num,
+            image_size=cropped_size,
+        )
+        image_token = cast(str, self.tokenizer.image_token)
+        for i in range(len(text)):
+            if image_token in text[i]:
+                text[i] = text[i].replace(image_token, image_token_placeholder * len(cropped_images))
+                break
+        return text, cropped_images
+    def _preprocess_videos(
+        self,
+        text: List[str],
+        videos: List[List[Image]],
+        *,
+        image_token_placeholder: str,
+    ) -> Tuple[List[str], List[List[List[Image]]]]:
+        image_token = cast(str, self.tokenizer.image_token)
+        video_token = cast(str, self.tokenizer.video_token)
+        processed_text = text
+        processed_video_tiles: List[List[List[Image]]] = []
+        for video in videos:
+            # Replace the first video token with #frame image tokens.
+            for i in range(len(processed_text)):
+                if video_token in processed_text[i]:
+                    processed_text[i] = processed_text[i].replace(video_token, image_token * len(video))
                     break
+            processed_frame_tiles: List[List[Image]] = []
+            for frame in video:
+                processed_text, processed_single_frame_tiles = self._preprocess_single_image(
+                    processed_text,
+                    frame,
+                    image_token_placeholder=image_token_placeholder,
+                    is_video_frame=True,
+                    use_dynamic_preprocess=(self.video_max_tiles > 1),
+                )
+                processed_frame_tiles.append(processed_single_frame_tiles)
+            processed_video_tiles.append(processed_frame_tiles)
+        return processed_text, processed_video_tiles
+    def _replace_image_tile_suffix(self, text_inputs: BatchEncoding) -> BatchEncoding:
+        lf_token_id = cast(int, self.tokenizer.encode("\n")[0])
+        image_token_id = cast(int, self.tokenizer.image_token_id)
+        for i in range(len(text_inputs.input_ids)):
+            input_ids = text_inputs.input_ids[i]
+            idx = 0
+            while idx < len(input_ids):
+                if input_ids[idx] != image_token_id:
+                    idx += 1
+                    continue
+                if idx + self.image_pad_len < len(input_ids):
+                    input_ids[idx + self.image_pad_len] = lf_token_id
+                    idx += self.image_pad_len + 1
+                else:
+                    break
+        return text_inputs
 def dynamic_preprocess(image: Image, min_num: int, max_num: int, image_size: int, use_thumbnail=True) -> List[Image]: