Upload files with `vila-upload`.
Upload configuration_vila.py
Upload config.json
Upload auto_processor.py
Upload modeling_vila.py
- auto_processor.py +10 -8
- config.json +3 -3
- configuration_vila.py +2 -2
- modeling_vila.py +13 -11
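
Together, these four files wire the checkpoint into transformers' remote-code loading path. A minimal loading sketch, assuming `NVILA-Lite-8B-hf-preview` as the local directory or Hub id (the path is an assumption; substitute the actual model id):

from transformers import AutoConfig, AutoModel, AutoProcessor

# Assumed path: a local clone or Hub id of this checkpoint.
model_path = "NVILA-Lite-8B-hf-preview"

# trust_remote_code=True resolves the auto_map entries in config.json to
# configuration_vila.VILAConfig, modeling_vila.VILAForCausalLM and
# auto_processor.VILAProcessor uploaded in this commit.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
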
auto_processor.py
CHANGED

@@ -153,7 +153,9 @@ class VILAProcessor(ProcessorMixin):
     # image_processor_class = "VILAImageProcessor"
     # tokenizer_class = ("VILATokenizer", "VILATokenizerFast")

-    def __init__(
+    def __init__(
+        self, image_processor=None, tokenizer=None, chat_template=None, config=None, padding_side="left", **kwargs
+    ):
         self.image_token = MEDIA_TOKENS["image"]
         self.video_token = MEDIA_TOKENS["video"]
         self.config = config
@@ -161,11 +163,10 @@ class VILAProcessor(ProcessorMixin):
         self.tokenizer = tokenizer
         self.padding_side = padding_side

-        # This is a special setting for Qwen.
+        # This is a special setting for Qwen.
         # self.pad_token_id = tokenizer.pad_token_id
-        self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0]
+        self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0] # 151643
         self.eos_token_id = self.tokenizer.eos_token_id
-
         super().__init__(image_processor, tokenizer, chat_template=chat_template)

     @staticmethod
@@ -243,6 +244,7 @@ class VILAProcessor(ProcessorMixin):
         else:
             print(f"pretrained_model_name_or_path {pretrained_model_name_or_path} is not a directory, downloading")
             from huggingface_hub import snapshot_download
+
             pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)

         image_processor = AutoImageProcessor.from_pretrained(
@@ -293,7 +295,7 @@ class VILAProcessor(ProcessorMixin):
                 media[name] += feat.media[name]
             for name in feat.media_config:
                 media_config[name].update(feat.media_config[name])
-
+
         # pad the input_ids to batchfy
         input_ids = pad_fn(
             input_ids_list,
@@ -354,18 +356,18 @@ class VILAProcessor(ProcessorMixin):
                     images = images.half()
                     media_config[name]["block_sizes"] = [block_sizes]
                 else:
-                    images = process_images(media["image"], self.
+                    images = process_images(media["image"], self.image_processor, self.config).half()
                 media[name] = [image for image in images]
             elif name == "video":
                 media[name] = [
-                    process_images(images, self.
-                    for images in media[name]
+                    process_images(images, self.image_processor, self.config).half() for images in media[name]
                 ]
             else:
                 raise ValueError(f"Unsupported media type: {name}")

         inputs = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True, return_ids_only=False)
         input_ids = inputs.input_ids[0].unsqueeze(0).cuda()
+
         attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
         return BatchFeature(
             data={
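
The `# 151643` comment added next to the pad-token lookup is the id of Qwen2's `<|endoftext|>` token, which the processor reuses as its padding id. A quick way to reproduce that lookup with a stock Qwen2 tokenizer (the tokenizer id below is illustrative; the processor uses the tokenizer bundled with this checkpoint):

from transformers import AutoTokenizer

# Illustrative tokenizer id; any Qwen2-family tokenizer should give the same id.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

# Same lookup as the processor's __init__: tokenize the literal string and
# take the first resulting token id.
pad_token_id = tok("<|endoftext|>").input_ids[0]
print(pad_token_id)  # 151643
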
config.json
CHANGED

@@ -2,12 +2,12 @@
   "_attn_implementation_autoset": true,
   "_name_or_path": "NVILA-Lite-8B-hf-preview",
   "architectures": [
-    "
+    "VILAForCausalLM"
   ],
   "auto_map": {
     "AutoConfig": "configuration_vila.VILAConfig",
-    "AutoModel": "modeling_vila.
-    "AutoModelForCausalLM": "modeling_vila.
+    "AutoModel": "modeling_vila.VILAForCausalLM",
+    "AutoModelForCausalLM": "modeling_vila.VILAForCausalLM",
     "AutoProcessor": "auto_processor.VILAProcessor"
   },
   "chat_template": null,
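
Both `auto_map` model entries and the `architectures` list now name the same `VILAForCausalLM` class. A small consistency check over a local copy of this config (the file path is an assumption):

import json

# Assumes the config.json from this commit sits in the current directory.
with open("config.json") as f:
    cfg = json.load(f)

arch = cfg["architectures"][0]
# The remote-code model entries should end with the architecture name.
assert cfg["auto_map"]["AutoModel"].endswith(arch)
assert cfg["auto_map"]["AutoModelForCausalLM"].endswith(arch)
print(arch)  # VILAForCausalLM
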
configuration_vila.py
CHANGED

@@ -57,7 +57,8 @@ class VILAConfig(PretrainedConfig):
         video_encoder: str = '{"_target_": "llava.model.encoders.BasicVideoEncoder"}',
         **kwargs,
     ):
-        super().__init__()
+        super().__init__(**kwargs)
+
         self.architectures = architectures
         self.llm_cfg = llm_cfg
         self.vision_tower_cfg = vision_tower_cfg
@@ -90,4 +91,3 @@ class VILAConfig(PretrainedConfig):
         self.image_encoder = image_encoder
         self.video_encoder = video_encoder

-        super().__init__(**kwargs)
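
The config now forwards **kwargs to PretrainedConfig once, at the top of __init__, rather than calling super().__init__() twice. A minimal sketch of the same pattern (the class and field names here are illustrative, not the full VILAConfig):

from transformers import PretrainedConfig

class ToyVILAStyleConfig(PretrainedConfig):
    model_type = "toy_vila"  # illustrative, not the real model_type

    def __init__(self, llm_cfg=None, vision_tower_cfg=None, **kwargs):
        # Hand common kwargs (architectures, torch_dtype, ...) to the base
        # class first, then set the subclass-specific fields.
        super().__init__(**kwargs)
        self.llm_cfg = llm_cfg
        self.vision_tower_cfg = vision_tower_cfg

cfg = ToyVILAStyleConfig(llm_cfg={"model_type": "qwen2"})
print(cfg.to_dict()["llm_cfg"])
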
modeling_vila.py
CHANGED

@@ -59,6 +59,7 @@ from .utils import get_model_config, load_tokenizer_then_handle_media_tokens_and
 # ease debugging
 python_input = input

+
 # quick hack for remote code
 def get_pg_manager():
     return None
@@ -191,6 +192,7 @@ class VILAPretrainedModel(PreTrainedModel):
     main_input_name = "input_embeds"
     supports_gradient_checkpointing = True
     _supports_flash_attn_2 = True
+    _no_split_modules = ["Qwen2DecoderLayer", "SiglipEncoderLayer"]

     def __init__(self, config: VILAConfig, *args, **kwargs):
         super().__init__(config)
@@ -268,12 +270,12 @@
         cfg_path = os.path.join(output_dir, "config.json")
         config = json.load(open(cfg_path))
         config["version"] = "2.0" # nvila tag
-        config["architectures"] = ["
+        config["architectures"] = ["VILAForCausalLM"]
         config["auto_map"] = {
             "AutoProcessor": "auto_processor.VILAProcessor",
             "AutoConfig": "modeling_vila.VILAConfig",
-            "AutoModel": "modeling_vila.
-            "AutoModelForCausalLM": "modeling_vila.
+            "AutoModel": "modeling_vila.VILAForCausalLM",
+            "AutoModelForCausalLM": "modeling_vila.VILAForCausalLM",
         }
         # vila1.5 legacy support
         config["model_type"] = "vila"
@@ -501,7 +503,7 @@ class VILAPretrainedModel(PreTrainedModel):
         self.get_mm_projector().eval()


-class
+class VILAForCausalLM(VILAPretrainedModel):
     def __init__(self, config: VILAConfig, *args, **kwargs):
         super().__init__(config, *args, **kwargs)

@@ -1082,7 +1084,8 @@ class VILAForCasualLM(VILAPretrainedModel):

         return outputs

-
+    # TODO(ligeng): check how qwen implements this function
+    # @torch.inference_mode()
     def generate(
         self,
         input_ids: Optional[torch.FloatTensor] = None,
@@ -1100,14 +1103,13 @@ class VILAForCasualLM(VILAPretrainedModel):
         input_emds: <media emd> 001 002 003 004
         """
         # NOTE: hard code to move to GPU
-        input_ids = input_ids.cuda()
-        media = {k: [v.cuda() for v in media[k]] for k in media}
-        if attention_mask is not None:
-            attention_mask = attention_mask.cuda()
-
+        # input_ids = input_ids.cuda()
+        # media = {k: [v.cuda() if v is not None for v in media[k]] for k in media}
+        # if attention_mask is not None:
+        #     attention_mask = attention_mask.cuda()
         inputs_embeds, _, attention_mask = self._embed(input_ids, media, media_config, None, attention_mask)
         output_ids = self.llm.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
-
+
         if return_output_ids_only:
             return_value = output_ids
         else:
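
The new `_no_split_modules` attribute tells accelerate which blocks must stay on a single device when a device map is inferred, and the commented-out `.cuda()` calls mean `generate` now expects its inputs to already be on the right device. A hedged sketch of loading with an automatic device map (the model path is an assumption):

import torch
from transformers import AutoModel

model_path = "NVILA-Lite-8B-hf-preview"  # assumed local path or Hub id

# With _no_split_modules declared, the inferred device map keeps every
# Qwen2DecoderLayer and SiglipEncoderLayer on one device.
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
print(model.hf_device_map)  # module -> device assignment
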