Create custom processor for easier inference

#11 by pcuenq (HF Staff) - opened
README.md CHANGED
@@ -55,56 +55,42 @@ python predict.py --model-path /path/to/checkpoint-dir \
  To run inference with transformers we can leverage `trust_remote_code` along with the following snippet:

  ```python
- import torch
- from PIL import Image
- from transformers import AutoTokenizer, AutoModelForCausalLM
+ from transformers import AutoModelForCausalLM, AutoProcessor

- MID = "apple/FastVLM-0.5B"
- IMAGE_TOKEN_INDEX = -200  # what the model code looks for
+ model_id = "apple/FastVLM-0.5B"

- # Load
- tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
+ processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
  model = AutoModelForCausalLM.from_pretrained(
-     MID,
-     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-     device_map="auto",
+     model_id,
      trust_remote_code=True,
  )

- # Build chat -> render to string (not tokens) so we can place <image> exactly
+ image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
  messages = [
-     {"role": "user", "content": "<image>\nDescribe this image in detail."}
+     {
+         "role": "user",
+         "content": [
+             {"type": "image", "image": image_url},
+             {"type": "text", "text": "Describe this image in detail."},
+         ]
+     }
  ]
- rendered = tok.apply_chat_template(
-     messages, add_generation_prompt=True, tokenize=False
- )
-
- pre, post = rendered.split("<image>", 1)
-
- # Tokenize the text *around* the image token (no extra specials!)
- pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
- post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
-
- # Splice in the IMAGE token id (-200) at the placeholder position
- img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
- input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
- attention_mask = torch.ones_like(input_ids, device=model.device)

- # Preprocess image via the model's own processor
- img = Image.open("test-2.jpg").convert("RGB")
- px = model.get_vision_tower().image_processor(images=img, return_tensors="pt")["pixel_values"]
- px = px.to(model.device, dtype=model.dtype)
+ inputs = processor.apply_chat_template(
+     messages,
+     add_generation_prompt=True,
+     tokenize=True,
+     return_tensors="pt",
+     return_dict=True,
+ )

- # Generate
- with torch.no_grad():
-     out = model.generate(
-         inputs=input_ids,
-         attention_mask=attention_mask,
-         images=px,
-         max_new_tokens=128,
-     )
+ out = model.generate(
+     **inputs,
+     do_sample=False,
+     max_new_tokens=150,
+ )

- print(tok.decode(out[0], skip_special_tokens=True))
+ print(processor.tokenizer.decode(out[0], skip_special_tokens=False))
  ```

  ## Citation
@@ -117,4 +103,4 @@ If you found this model useful, please cite the following paper:
  month = {June},
  year = {2025},
  }
- ```
+ ```
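
Note (not part of the diff): the updated snippet loads the model with the default device and dtype. If you want the GPU / half-precision behavior of the previous snippet, the standard `from_pretrained` arguments still apply, and the processor output (a `BatchFeature`) can be moved in one call. A minimal sketch, assuming CUDA is available:

```python
# Sketch only: restores the device/dtype handling of the previous README snippet.
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "apple/FastVLM-0.5B"
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,
)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"},
            {"type": "text", "text": "Describe this image in detail."},
        ],
    }
]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
)
# BatchFeature.to() casts only floating-point tensors (pixel_values), so input_ids stay int64.
inputs = inputs.to(model.device, dtype=model.dtype)
out = model.generate(**inputs, do_sample=False, max_new_tokens=150)
print(processor.tokenizer.decode(out[0], skip_special_tokens=False))
```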
config.json CHANGED
@@ -1,12 +1,12 @@
  {
-   "_name_or_path": "./llava-v1.5-13b",
    "architectures": [
      "LlavaQwen2ForCausalLM"
    ],
    "auto_map": {
      "AutoConfig": "llava_qwen.LlavaConfig",
-     "AutoModelForCausalLM": "llava_qwen.LlavaQwen2ForCausalLM"
-   },
+     "AutoModelForCausalLM": "llava_qwen.LlavaQwen2ForCausalLM",
+     "AutoProcessor": "processing_fastvlm.FastVLMProcessor"
+   },
    "attention_dropout": 0.0,
    "bos_token_id": 151643,
    "eos_token_id": 151645,
@@ -45,5 +45,24 @@
    "use_cache": true,
    "use_mm_proj": true,
    "use_sliding_window": false,
+   "vision_config": {
+     "cls_ratio": 2.0,
+     "down_patch_size": 7,
+     "down_stride": 2,
+     "downsamples": [true, true, true, true, true],
+     "embed_dims": [96, 192, 384, 768, 1536],
+     "hidden_size": 1024,
+     "image_size": 1024,
+     "intermediate_size": 3072,
+     "layer_scale_init_value": 1e-5,
+     "layers": [2, 12, 24, 4, 2],
+     "mlp_ratios": [4, 4, 4, 4, 4],
+     "num_classes": 1000,
+     "patch_size": 64,
+     "pos_embs_shapes": [null, null, null, [7, 7], [7, 7]],
+     "projection_dim": 768,
+     "repmixer_kernel_size": 3,
+     "token_mixers": ["repmixer", "repmixer", "repmixer", "attention", "attention"]
+   },
    "vocab_size": 151936
  }
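
The new `AutoProcessor` entry in `auto_map` is what lets the Auto classes resolve to the custom processor shipped in this repo when `trust_remote_code=True` is passed. A quick check (sketch):

```python
from transformers import AutoProcessor

# Resolves via the auto_map entry above to the class defined in processing_fastvlm.py.
processor = AutoProcessor.from_pretrained("apple/FastVLM-0.5B", trust_remote_code=True)
print(type(processor).__name__)  # expected: FastVLMProcessor
```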
llava_qwen.py CHANGED
@@ -2140,8 +2140,8 @@ class LlavaQwen2ForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
      @torch.no_grad()
      def generate(
          self,
-         inputs: Optional[torch.Tensor] = None,
-         images: Optional[torch.Tensor] = None,
+         input_ids: Optional[torch.Tensor] = None,
+         pixel_values: Optional[torch.Tensor] = None,
          image_sizes: Optional[torch.Tensor] = None,
          **kwargs,
      ) -> Union[GenerateOutput, torch.LongTensor]:
@@ -2150,21 +2150,21 @@ class LlavaQwen2ForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
          if "inputs_embeds" in kwargs:
              raise NotImplementedError("`inputs_embeds` is not supported")

-         if images is not None:
+         if pixel_values is not None:
              (
-                 inputs,
+                 input_ids,
                  position_ids,
                  attention_mask,
                  _,
                  inputs_embeds,
                  _
              ) = self.prepare_inputs_labels_for_multimodal(
-                 inputs,
+                 input_ids,
                  position_ids,
                  attention_mask,
                  None,
                  None,
-                 images,
+                 pixel_values,
                  image_sizes=image_sizes
              )
          else:
@@ -2179,17 +2179,17 @@ class LlavaQwen2ForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):

      def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
                                        inputs_embeds=None, **kwargs):
-         images = kwargs.pop("images", None)
+         images = kwargs.pop("pixel_values", None)
          image_sizes = kwargs.pop("image_sizes", None)
          inputs = super().prepare_inputs_for_generation(
              input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
          )
          if images is not None:
-             inputs['images'] = images
+             inputs['pixel_values'] = images
          if image_sizes is not None:
              inputs['image_sizes'] = image_sizes
          return inputs


  AutoConfig.register("llava_qwen2", LlavaConfig)
- AutoModelForCausalLM.register(LlavaConfig, LlavaQwen2ForCausalLM)
+ AutoModelForCausalLM.register(LlavaConfig, LlavaQwen2ForCausalLM)
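
Renaming the `generate()` arguments from `inputs`/`images` to `input_ids`/`pixel_values` makes the signature line up with the keys that `FastVLMProcessor` returns, so the README's `model.generate(**inputs, ...)` call works without manual splicing. A minimal sketch of the key match, assuming the `model`, `processor`, and `messages` from the README snippet above are already defined:

```python
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
)
# The processor output keys should mirror the renamed generate() arguments.
print(sorted(inputs.keys()))  # expected: ['attention_mask', 'image_sizes', 'input_ids', 'pixel_values']
out = model.generate(**inputs, max_new_tokens=150)
```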
preprocessor_config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "auto_map": {
+     "AutoImageProcessor": "processing_fastvlm.FastVLMImageProcessor"
+   },
+   "image_processor_type": "FastVLMImageProcessor",
+   "crop_size": {
+     "height": 1024,
+     "width": 1024
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.0,
+     0.0,
+     0.0
+   ],
+   "image_std": [
+     1.0,
+     1.0,
+     1.0
+   ],
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 1024
+   }
+ }
+
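
With `image_mean` of 0 and `image_std` of 1, preprocessing only rescales pixel values to `[0, 1]` (no mean/std normalization); images are padded to a square, resized so the shortest edge is 1024, and center-cropped to 1024×1024. A sketch of loading just the image processor, using a dummy 640×480 input (the shape comments are what I would expect, not verified output):

```python
from PIL import Image
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained("apple/FastVLM-0.5B", trust_remote_code=True)
batch = image_processor(images=Image.new("RGB", (640, 480)))  # dummy black image, just to inspect shapes
print(batch["pixel_values"].shape)  # expected: torch.Size([1, 3, 1024, 1024])
print(batch["image_sizes"])         # should be the original (width, height) before padding/resize
```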
processing_fastvlm.py ADDED
@@ -0,0 +1,88 @@
+ import re
+ import torch
+ from transformers import ProcessorMixin, BatchFeature, CLIPImageProcessorFast
+ from transformers.image_processing_utils import BaseImageProcessor
+ from transformers.image_utils import ImageInput
+ from typing import Any, Dict, List, Optional, Union
+ from PIL import Image
+
+ from .llava_qwen import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
+
+ # Adapted from transformers.models.llava_next.image_processing_llava_next.expand_to_square
+ def expand_to_square(image: torch.Tensor, background_color=0) -> torch.Tensor:
+     """
+     Expands an image to a square by adding a background color.
+     """
+     c, height, width = image.shape
+     if width == height:
+         return image
+     elif width > height:
+         result = torch.ones((c, width, width), dtype=image.dtype) * background_color
+         result[:, (width - height) // 2 : (width - height) // 2 + height, :] = image
+         return result
+     else:
+         result = torch.ones((c, height, height), dtype=image.dtype) * background_color
+         result[:, :, (height - width) // 2 : (height - width) // 2 + width] = image
+         return result
+
+
+ class FastVLMImageProcessor(CLIPImageProcessorFast):
+     def _preprocess(self, images, **kwargs):
+         image_sizes = [image.shape[-2:][::-1] for image in images]
+         images = [expand_to_square(image) for image in images]
+         images = super()._preprocess(images, **kwargs)
+         pixel_values = torch.stack(images.pixel_values, dim=0)
+         return BatchFeature(data={"pixel_values": pixel_values, "image_sizes": image_sizes})
+
+ class FastVLMProcessor(ProcessorMixin):
+     attributes = ["tokenizer", "image_processor"]
+     image_processor_class = "AutoImageProcessor"
+     tokenizer_class = "AutoTokenizer"
+
+     def __init__(
+         self,
+         tokenizer,
+         image_processor,
+         chat_template=None,
+         **kwargs
+     ):
+         super().__init__(tokenizer, image_processor, chat_template=chat_template, **kwargs)
+
+     def __call__(
+         self,
+         images: ImageInput = None,
+         text: Optional[Union[str, List[str]]] = None,
+         return_tensors: Optional[str] = "pt",
+         **kwargs,
+     ) -> BatchFeature:
+         if isinstance(text, str):
+             text = [text]
+         elif not isinstance(text, list) and not isinstance(text[0], str):
+             raise TypeError("Invalid input text. Please provide a string, or a list of strings")
+
+         image_inputs = {}
+         if images is not None:
+             image_inputs = self.image_processor(images=images)
+
+         image_token = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=torch.int64)
+         input_ids = torch.tensor([], dtype=torch.int64)
+         attention_mask = torch.tensor([], dtype=torch.int64)
+         for prompt in text:
+             image_indexes = [m.start() for m in re.finditer(DEFAULT_IMAGE_TOKEN, prompt)]
+             if len(image_indexes) > 1:
+                 raise ValueError(
+                     f"Expected up to 1 image tokens per prompt, got {len(image_indexes)} instead."
+                 )
+
+             # DEFAULT_IMAGE_TOKEN maps to IMAGE_TOKEN_INDEX (-200), which is not in the vocab (so we can't tokenize the full string)
+             pre, _, post = prompt.partition(DEFAULT_IMAGE_TOKEN)
+             pre_ids = self.tokenizer(pre, return_tensors="pt", add_special_tokens=False).input_ids
+             post_ids = self.tokenizer(post, return_tensors="pt", add_special_tokens=False).input_ids
+
+             sample_ids = torch.cat([pre_ids, image_token, post_ids], dim=1).to(dtype=torch.int64)
+             sample_mask = torch.ones_like(sample_ids)
+
+             input_ids = torch.cat([input_ids, sample_ids], dim=0)
+             attention_mask = torch.cat([attention_mask, sample_mask], dim=0)
+
+         return BatchFeature(data={"input_ids": input_ids, "attention_mask": attention_mask, **image_inputs}, tensor_type=return_tensors)
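
The processor can also be called directly, without the chat template, on a prompt that already contains the `<image>` placeholder (`DEFAULT_IMAGE_TOKEN`); it splices `IMAGE_TOKEN_INDEX` (-200) between the tokenized text segments, which is what the previous README snippet did by hand. A sketch with a hand-written Qwen-style prompt and a dummy image:

```python
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("apple/FastVLM-0.5B", trust_remote_code=True)

prompt = "<|im_start|>user\n<image>\nDescribe this image in detail.<|im_end|>\n<|im_start|>assistant\n"
batch = processor(images=Image.new("RGB", (640, 480)), text=prompt)

# The <image> placeholder becomes a single -200 id spliced into the token sequence.
print((batch["input_ids"] == -200).nonzero())
print(batch["pixel_values"].shape)  # expected: torch.Size([1, 3, 1024, 1024])
```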
processor_config.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "chat_template": "{%- if messages is string -%}\n {{- messages -}}\n{%- else -%}\n {%- for message in messages -%}\n {%- if loop.first and messages[0]['role'] != 'system' -%}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' -}}\n {%- endif -%}\n {{- '<|im_start|>' + message['role'] + '\\n' -}}\n {%- if message['content'] is string -%}\n {{- message['content'] -}}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{- '<image>\\n' -}}\n {%- elif item['type'] == 'text' -%}\n {{- item['text'] -}}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{- raise_exception(\"Invalid content type\") -}}\n {%- endif -%}\n {{- '<|im_end|>' + '\\n' -}}\n {%- endfor -%}\n {%- if add_generation_prompt -%}\n {{- '<|im_start|>assistant\\n' -}}\n {%- endif -%}\n{%- endif -%}\n"
+ }
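
The chat template is Qwen2-style: it injects a default system prompt when none is given, wraps each turn in `<|im_start|>` / `<|im_end|>`, and replaces every `{"type": "image"}` content item with the literal `<image>` placeholder that `FastVLMProcessor` later swaps for the -200 token id. A sketch rendering it as a string (`tokenize=False`) to see exactly what gets tokenized:

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("apple/FastVLM-0.5B", trust_remote_code=True)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image in detail."},
        ],
    }
]
print(processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False))
# Expected rendering, per the template above:
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# <image>
# Describe this image in detail.<|im_end|>
# <|im_start|>assistant
```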