update preprocessing

- demo.ipynb +3 -5
- image_processing_blip_3.py +3 -18
- vlm.py +1 -126
demo.ipynb
CHANGED

@@ -253,10 +253,10 @@
 "    for fn in sample['image_path']:\n",
 "        img = PIL.Image.open(fn)\n",
 "        display.display(Image(filename=fn, width=300))\n",
-"        image_list.append(image_processor([img], image_aspect_ratio='anyres')[\"pixel_values\"])\n",
+"        image_list.append(image_processor([img], image_aspect_ratio='anyres')[\"pixel_values\"].cuda())\n",
 "        image_sizes.append(img.size)\n",
 "    inputs = {\n",
-"        \"pixel_values\": image_list\n",
+"        \"pixel_values\": [image_list]\n",
 "    }\n",
 "    for query in sample['question']:\n",
 "        prompt = apply_prompt_template(query)\n",
@@ -266,9 +266,7 @@
 "        for name, value in inputs.items():\n",
 "            if isinstance(value, torch.Tensor):\n",
 "                inputs[name] = value.cuda()\n",
-"
-"                inputs[name] = [v.cuda() for v in value]\n",
-"        generated_text = model.generate(**inputs, image_size=image_sizes,\n",
+"        generated_text = model.generate(**inputs, image_size=[image_sizes],\n",
 "                                        pad_token_id=tokenizer.pad_token_id,\n",
 "                                        do_sample=False, max_new_tokens=1024, top_p=None, num_beams=1,\n",
 "                                        )\n",
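Note on the demo change: the cell now moves each preprocessed anyres patch tensor to the GPU as it is appended, and wraps both pixel_values and image_size in an outer per-sample list before calling generate. A minimal sketch of the updated loop is below; it assumes model, tokenizer, image_processor, apply_prompt_template, and the sample dict are set up as elsewhere in the notebook, and the tokenization step between the two hunks (not shown in this diff) is written here as a plain Hugging Face tokenizer call.

import PIL.Image
import torch

image_list, image_sizes = [], []
for fn in sample["image_path"]:
    img = PIL.Image.open(fn)
    # Preprocess to anyres patches and move the tensor to GPU right away (new in this commit).
    image_list.append(
        image_processor([img], image_aspect_ratio="anyres")["pixel_values"].cuda()
    )
    image_sizes.append(img.size)

# pixel_values is now a nested list: one inner list of per-image patch tensors per sample.
inputs = {"pixel_values": [image_list]}

for query in sample["question"]:
    prompt = apply_prompt_template(query)
    # Assumed tokenization step (unchanged by this commit).
    inputs.update(tokenizer([prompt], return_tensors="pt"))
    for name, value in inputs.items():
        if isinstance(value, torch.Tensor):
            inputs[name] = value.cuda()

    # image_size is likewise wrapped in an outer per-sample list.
    generated_text = model.generate(
        **inputs,
        image_size=[image_sizes],
        pad_token_id=tokenizer.pad_token_id,
        do_sample=False, max_new_tokens=1024, top_p=None, num_beams=1,
    )

The extra nesting mirrors the image_size=[image_sizes] argument: one outer entry per sample, each holding that sample's per-image data.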
image_processing_blip_3.py
CHANGED

@@ -109,26 +109,11 @@ class Blip3ImageProcessor(BaseImageProcessor):
 
         if all(x.shape == new_images[0].shape for x in new_images):
             new_images = torch.stack(new_images, dim=0)
-        if image_aspect_ratio == '
-            new_images = BatchFeature(data={"pixel_values": new_images.unsqueeze(0).unsqueeze(0)}, tensor_type=return_tensors)
-        else:
+        if image_aspect_ratio == 'anyres':
             new_images = BatchFeature(data={"pixel_values": new_images}, tensor_type=return_tensors)
+        else:
+            new_images = BatchFeature(data={"pixel_values": new_images.unsqueeze(1).unsqueeze(0)}, tensor_type=return_tensors)
         return new_images
-    # def preprocess(self,
-    #                images: ImageInput,
-    #                return_tensors: Optional[Union[str, TensorType]] = None,
-    #                **kwargs) -> BatchFeature:
-    #     transforms = self.resize(self.size, self.resize_mode, self.interpolation_mode)
-    #     transforms.extend([
-    #         self.convert_rgb,
-    #         ToTensor(),
-    #         Normalize(mean=self.image_mean, std=self.image_std)
-    #     ])
-    #     composed_transforms = Compose(transforms)
-    #     images_tensor = composed_transforms(images).unsqueeze(0).unsqueeze(1).unsqueeze(0)
-    #     encoded_outputs = BatchFeature(data={"pixel_values": images_tensor}, tensor_type=return_tensors)
-    #     return encoded_outputs
-
 
 class ResizeKeepRatio:
     """ Resize and Keep Ratio
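Note on the processor change: the branch is effectively inverted. With image_aspect_ratio='anyres' the stacked patch tensor is now returned unchanged, while other modes get two singleton dimensions inserted via unsqueeze(1).unsqueeze(0), which lines up with the (B, T_img, F, C, H, W) vision input layout documented in vlm.py. A minimal shape sketch follows; the single 3x384x384 image is illustrative, not the processor's actual configuration.

import torch

# Stand-in for the stacked processor output for one plain (non-anyres) image.
new_images = torch.randn(1, 3, 384, 384)           # [N, C, H, W]

# anyres path: returned unchanged, one row per patch.
anyres_pixel_values = new_images                    # [Np, C, H, W]

# non-anyres path: insert T_img and F singleton dims for the VLM.
plain_pixel_values = new_images.unsqueeze(1).unsqueeze(0)
print(plain_pixel_values.shape)                     # torch.Size([1, 1, 1, 3, 384, 384])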
vlm.py
CHANGED

@@ -1043,10 +1043,6 @@ class VLMWithLanguageStream(VLM):
                 multimodal_labels.append(labels[i].clone())
                 continue
 
-            # since an image is represented by self.num_tokens_per_vis tokens, we need to offset the image_token_idxs
-            for j, img_idx in enumerate(image_token_idxs):
-                image_token_idxs[j] += (self.num_tokens_per_vis - 1) * j  # FIXME: different offset for any resolution encoding when has multiple images.
-
             # loop through the image_token_idxs and insert the vision tokens
             new_embed = lang_embeds[i].clone()
             new_attention_mask = (
@@ -1056,9 +1052,6 @@
             new_label = labels[i].clone()
 
             for img_num, img_idx in enumerate(image_token_idxs):
-                if img_num > 0:
-                    # FIXME: hardcoded as such to avoid assertion error, but this only works for single image samples.
-                    break
                 # Get vision token attention mask for padded llava-style any resolution image tokens.
                 if self.image_aspect_ratio =='anyres':
                     num_vis_tokens = vision_tokens[i][img_num].shape[0]
@@ -1078,7 +1071,6 @@
                     vis_attention_mask = torch.ones(
                         num_vis_tokens, dtype=torch.long
                     ).to(attention_mask.device)
-
 
                 new_embed = torch.cat(
                     (
@@ -1275,123 +1267,6 @@ class XGenMMPerceiver(VLMWithLanguageStream):
         """
         return True
 
-    def forward(
-        self,
-        vision_x: Optional[torch.Tensor],
-        lang_x: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        labels: Optional[torch.Tensor] = None,
-        image_size: Optional[Tuple] = None,
-        past_key_values: Optional[
-            List[Union[torch.Tensor, Tuple[torch.Tensor]]]
-        ] = None,
-        past_media_locations: Optional[torch.Tensor] = None,
-        past_vision_tokens: Optional[torch.Tensor] = None,
-        use_cache: Optional[bool] = False,
-        **kwargs,
-    ):
-        """
-        Args:
-            vision_x: Vision input
-                shape (B, T_img, F, C, H, W) with F=1
-                only F = 1 is supported (single-frame videos)
-                if T_img > the number of media tokens in the corresponding input_ids (lang_x),
-                only the first number of media tokens in lang_x are used
-            lang_x: Language input ids, with media tokens denoting where
-                visual media should be inserted.
-                shape (B, T_txt)
-            attention_mask: Attention mask. Defaults to None.
-            labels: Labels. Defaults to None.
-                shape (B, T_txt)
-            past_key_values (Tuple[torch.Tensor]], optional): Past key value pairs for each of the T_txt previous tokens in the language model. Defaults to None.
-                list of length = number of decoder layers in the LM
-                exact implementation depends on LM, see Hugging Face docs
-            past_media_locations (torch.Tensor, optional): boolean mask denoting which of the previous T_txt tokens were media tokens. Defaults to None.
-                shape (B, T_txt)
-            past_vision_tokens (torch.Tensor, optional): Previous vision tokens. Defaults to None.
-            use_cache (Optional[bool], optional): Whether to use cache. Defaults to False.
-                If True, includes key_values, media_locations, and vision_tokens in the output.
-        """
-        assert not (past_vision_tokens is None) ^ (
-            past_media_locations is None
-        ), "past_vision_tokens and past_media_locations must both be None or both be not None"
-
-        # convert pixels to vision tokens
-        vision_attention_mask = None
-        if vision_x is not None:
-            if self.image_aspect_ratio == 'anyres':
-                input_dict = dict(image=vision_x, image_size=image_size)
-                vision_features, vision_attn_masks = self._encode_vision_x_anyres(input_dict, lang_x.device)
-            else:
-                vision_features = self._encode_vision_x(vision_x=vision_x)
-                vision_attn_masks = None
-            # Same for attention masks: [b, Np, v] -> [b*Np, v]
-            if self.anyres_patch_sampling:
-                split_sizes = [feature.shape[0] for feature in vision_features]
-                # Nested splits for multi-image samples.
-                if isinstance(vision_x[0], list):
-                    nt_images = [len(images) for images in vision_x]
-                    split_split_sizes = []
-                    img_id = 0
-                    for nt in nt_images:
-                        split_split_sizes.append(split_sizes[img_id:img_id+nt])
-                        img_id += nt
-                else:
-                    nt_images = [1] * len(vision_x)
-                    split_split_sizes = split_sizes
-                vision_features = torch.cat(vision_features, dim=0)
-                vision_features = vision_features[:, None, None, :, :]  # Expand dimensions.
-                vision_attn_masks = torch.cat(vision_attn_masks, dim=0)
-            # TODO: add an option that allows restoring the T dimension for video tokenization.
-            vision_tokens = self.vision_tokenizer(vision_features, vision_attn_masks)
-
-            # Post-processing: Split the batches into groups of patches and concatenate them together.
-            if self.anyres_patch_sampling:
-                # assert isinstance(vision_x, list)
-                if isinstance(vision_x[0], list):
-                    vision_token_groups = torch.split(vision_tokens, list(sum(nt_img) for nt_img in split_split_sizes), dim=0)
-                    vision_tokens = []
-
-                    for sample_id, patch_vis_tokens in enumerate(vision_token_groups):
-                        patch_vis_token_groups = torch.split(patch_vis_tokens, split_split_sizes[sample_id], dim=0)  # [Np*nt, 1, v, d] -> [[Np_t, 1, v, d], ...]
-                        flatten_vision_tokens = []
-                        for image_vis_token in patch_vis_token_groups:
-                            image_vis_token = image_vis_token.flatten(0, 2)  # [Np, 1, v, d] -> [Np*v, d]
-                            flatten_vision_tokens.append(image_vis_token)
-                        vision_tokens_i = flatten_vision_tokens
-                        vision_tokens.append(vision_tokens_i)
-                else:
-                    vision_token_groups = torch.split(vision_tokens, split_sizes, dim=0)
-                    vision_tokens = []
-                    for patch_vis_tokens in vision_token_groups:
-                        patch_vis_tokens = patch_vis_tokens.flatten(0, 2)  # [Np, 1, v, d] -> [Np*v, d]
-                        vision_tokens.append(patch_vis_tokens.unsqueeze(0))  # Add the nt dimension.
-        else:
-            vision_tokens = None
-
-        # fuse the vision and language tokens
-        new_inputs = self._prepare_inputs_for_forward(
-            vision_tokens=vision_tokens,
-            lang_x=lang_x,
-            attention_mask=attention_mask,
-            vision_attention_mask=vision_attention_mask,
-            labels=labels,
-            past_key_values=past_key_values,
-            past_media_locations=past_media_locations,
-            padding_side="right",
-            past_vision_tokens=past_vision_tokens,
-        )
-        output = self.lang_model(
-            **new_inputs,
-            use_cache=use_cache,
-            past_key_values=past_key_values,
-            **kwargs,
-        )
-
-        # postforward hooks
-        self._post_forward_hook()
-        return output
-
     def generate(
         self,
         vision_x: torch.Tensor,
@@ -1429,7 +1304,7 @@ class XGenMMPerceiver(VLMWithLanguageStream):
             else:
                 vision_features = self._encode_vision_x(vision_x=vision_x)
                 vision_attn_masks = None
-            #
+            # If doing patch sampling, then flatten patches of shape [b, Np_i, v, d] -> [b*Np, v, d]
             # Same for attention masks: [b, Np, v] -> [b*Np, v]
             if self.anyres_patch_sampling:
                 split_sizes = [feature.shape[0] for feature in vision_features]
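Note on the vlm.py change: the forward() override is removed, the single-image-only shortcuts in the token-insertion loop (the index offset and the img_num > 0 break marked FIXME) are dropped, and generate() regains the comment describing anyres patch sampling: per-image patch features of shape [Np_i, v, d] are flattened into one [sum(Np_i), v, d] batch for the vision tokenizer and split back per image afterwards, using the split_sizes bookkeeping visible in the removed forward(). A self-contained sketch of that flatten/split round trip follows; the patch counts and the v, d sizes are illustrative.

import torch

v, d = 729, 1152                                    # illustrative patch-token count and width
# Three images with different numbers of anyres patches.
vision_features = [torch.randn(np_i, v, d) for np_i in (5, 10, 7)]

# Flatten: [b, Np_i, v, d] -> [sum(Np_i), v, d], remembering each image's patch count.
split_sizes = [f.shape[0] for f in vision_features]
flat = torch.cat(vision_features, dim=0)            # [22, v, d]

# ... the vision tokenizer would run on `flat` here ...

# Split back into per-image groups of patch tokens.
groups = torch.split(flat, split_sizes, dim=0)
assert [g.shape[0] for g in groups] == split_sizes  # [5, 10, 7]

torch.split with a list of sizes returns chunks in the original order, so each image gets back exactly the rows it contributed.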