Upload files with `vila-upload`.
Upload configuration_vila.py
Upload config.json
Upload auto_processor.py
Upload modeling_vila.py
- auto_processor.py +10 -8
- config.json +3 -3
- configuration_vila.py +2 -2
- modeling_vila.py +13 -11
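
Together, these four files wire the checkpoint into transformers' remote-code loading path. A minimal loading sketch, assuming `NVILA-Lite-8B-hf-preview` as the local directory or Hub id (the path is an assumption; substitute the actual model id):

from transformers import AutoConfig, AutoModel, AutoProcessor

# Assumed path: a local clone or Hub id of this checkpoint.
model_path = "NVILA-Lite-8B-hf-preview"

# trust_remote_code=True resolves the auto_map entries in config.json to
# configuration_vila.VILAConfig, modeling_vila.VILAForCausalLM and
# auto_processor.VILAProcessor uploaded in this commit.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
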
auto_processor.py
CHANGED

@@ -153,7 +153,9 @@ class VILAProcessor(ProcessorMixin):
     # image_processor_class = "VILAImageProcessor"
     # tokenizer_class = ("VILATokenizer", "VILATokenizerFast")

-    def __init__(
+    def __init__(
+        self, image_processor=None, tokenizer=None, chat_template=None, config=None, padding_side="left", **kwargs
+    ):
         self.image_token = MEDIA_TOKENS["image"]
         self.video_token = MEDIA_TOKENS["video"]
         self.config = config
@@ -161,11 +163,10 @@ class VILAProcessor(ProcessorMixin):
         self.tokenizer = tokenizer
         self.padding_side = padding_side

-        # This is a special setting for Qwen.
+        # This is a special setting for Qwen.
         # self.pad_token_id = tokenizer.pad_token_id
-        self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0]
+        self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0] # 151643
         self.eos_token_id = self.tokenizer.eos_token_id
-
         super().__init__(image_processor, tokenizer, chat_template=chat_template)

     @staticmethod
@@ -243,6 +244,7 @@ class VILAProcessor(ProcessorMixin):
         else:
             print(f"pretrained_model_name_or_path {pretrained_model_name_or_path} is not a directory, downloading")
             from huggingface_hub import snapshot_download
+
             pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)

         image_processor = AutoImageProcessor.from_pretrained(
@@ -293,7 +295,7 @@ class VILAProcessor(ProcessorMixin):
                 media[name] += feat.media[name]
             for name in feat.media_config:
                 media_config[name].update(feat.media_config[name])
-
+
         # pad the input_ids to batchfy
         input_ids = pad_fn(
             input_ids_list,
@@ -354,18 +356,18 @@ class VILAProcessor(ProcessorMixin):
                     images = images.half()
                     media_config[name]["block_sizes"] = [block_sizes]
                 else:
-                    images = process_images(media["image"], self.
+                    images = process_images(media["image"], self.image_processor, self.config).half()
                 media[name] = [image for image in images]
             elif name == "video":
                 media[name] = [
-                    process_images(images, self.
-                    for images in media[name]
+                    process_images(images, self.image_processor, self.config).half() for images in media[name]
                 ]
             else:
                 raise ValueError(f"Unsupported media type: {name}")

         inputs = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True, return_ids_only=False)
         input_ids = inputs.input_ids[0].unsqueeze(0).cuda()
+
         attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
         return BatchFeature(
             data={
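
The `# 151643` comment added next to the pad-token lookup is the id of Qwen2's `<|endoftext|>` token, which the processor reuses as its padding id. A quick way to reproduce that lookup with a stock Qwen2 tokenizer (the tokenizer id below is illustrative; the processor uses the tokenizer bundled with this checkpoint):

from transformers import AutoTokenizer

# Illustrative tokenizer id; any Qwen2-family tokenizer should give the same id.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

# Same lookup as the processor's __init__: tokenize the literal string and
# take the first resulting token id.
pad_token_id = tok("<|endoftext|>").input_ids[0]
print(pad_token_id)  # 151643
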
config.json
CHANGED

@@ -2,12 +2,12 @@
   "_attn_implementation_autoset": true,
   "_name_or_path": "NVILA-Lite-8B-hf-preview",
   "architectures": [
-    "
+    "VILAForCausalLM"
   ],
   "auto_map": {
     "AutoConfig": "configuration_vila.VILAConfig",
-    "AutoModel": "modeling_vila.
-    "AutoModelForCausalLM": "modeling_vila.
+    "AutoModel": "modeling_vila.VILAForCausalLM",
+    "AutoModelForCausalLM": "modeling_vila.VILAForCausalLM",
     "AutoProcessor": "auto_processor.VILAProcessor"
   },
   "chat_template": null,
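
Both `auto_map` model entries and the `architectures` list now name the same `VILAForCausalLM` class. A small consistency check over a local copy of this config (the file path is an assumption):

import json

# Assumes the config.json from this commit sits in the current directory.
with open("config.json") as f:
    cfg = json.load(f)

arch = cfg["architectures"][0]
# The remote-code model entries should end with the architecture name.
assert cfg["auto_map"]["AutoModel"].endswith(arch)
assert cfg["auto_map"]["AutoModelForCausalLM"].endswith(arch)
print(arch)  # VILAForCausalLM
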
configuration_vila.py
CHANGED

@@ -57,7 +57,8 @@ class VILAConfig(PretrainedConfig):
         video_encoder: str = '{"_target_": "llava.model.encoders.BasicVideoEncoder"}',
         **kwargs,
     ):
-        super().__init__()
+        super().__init__(**kwargs)
+
         self.architectures = architectures
         self.llm_cfg = llm_cfg
         self.vision_tower_cfg = vision_tower_cfg
@@ -90,4 +91,3 @@ class VILAConfig(PretrainedConfig):
         self.image_encoder = image_encoder
         self.video_encoder = video_encoder

-        super().__init__(**kwargs)
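
The config now forwards **kwargs to PretrainedConfig once, at the top of __init__, rather than calling super().__init__() twice. A minimal sketch of the same pattern (the class and field names here are illustrative, not the full VILAConfig):

from transformers import PretrainedConfig

class ToyVILAStyleConfig(PretrainedConfig):
    model_type = "toy_vila"  # illustrative, not the real model_type

    def __init__(self, llm_cfg=None, vision_tower_cfg=None, **kwargs):
        # Hand common kwargs (architectures, torch_dtype, ...) to the base
        # class first, then set the subclass-specific fields.
        super().__init__(**kwargs)
        self.llm_cfg = llm_cfg
        self.vision_tower_cfg = vision_tower_cfg

cfg = ToyVILAStyleConfig(llm_cfg={"model_type": "qwen2"})
print(cfg.to_dict()["llm_cfg"])
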
modeling_vila.py
CHANGED

@@ -59,6 +59,7 @@ from .utils import get_model_config, load_tokenizer_then_handle_media_tokens_and
 # ease debugging
 python_input = input

+
 # quick hack for remote code
 def get_pg_manager():
     return None
@@ -191,6 +192,7 @@ class VILAPretrainedModel(PreTrainedModel):
     main_input_name = "input_embeds"
     supports_gradient_checkpointing = True
     _supports_flash_attn_2 = True
+    _no_split_modules = ["Qwen2DecoderLayer", "SiglipEncoderLayer"]

     def __init__(self, config: VILAConfig, *args, **kwargs):
         super().__init__(config)
@@ -268,12 +270,12 @@
         cfg_path = os.path.join(output_dir, "config.json")
         config = json.load(open(cfg_path))
         config["version"] = "2.0" # nvila tag
-        config["architectures"] = ["
+        config["architectures"] = ["VILAForCausalLM"]
         config["auto_map"] = {
             "AutoProcessor": "auto_processor.VILAProcessor",
             "AutoConfig": "modeling_vila.VILAConfig",
-            "AutoModel": "modeling_vila.
-            "AutoModelForCausalLM": "modeling_vila.
+            "AutoModel": "modeling_vila.VILAForCausalLM",
+            "AutoModelForCausalLM": "modeling_vila.VILAForCausalLM",
         }
         # vila1.5 legacy support
         config["model_type"] = "vila"
@@ -501,7 +503,7 @@ class VILAPretrainedModel(PreTrainedModel):
         self.get_mm_projector().eval()


-class
+class VILAForCausalLM(VILAPretrainedModel):
     def __init__(self, config: VILAConfig, *args, **kwargs):
         super().__init__(config, *args, **kwargs)

@@ -1082,7 +1084,8 @@ class VILAForCasualLM(VILAPretrainedModel):

         return outputs

-
+    # TODO(ligeng): check how qwen implements this function
+    # @torch.inference_mode()
     def generate(
         self,
         input_ids: Optional[torch.FloatTensor] = None,
@@ -1100,14 +1103,13 @@ class VILAForCasualLM(VILAPretrainedModel):
         input_emds: <media emd> 001 002 003 004
         """
         # NOTE: hard code to move to GPU
-        input_ids = input_ids.cuda()
-        media = {k: [v.cuda() for v in media[k]] for k in media}
-        if attention_mask is not None:
-            attention_mask = attention_mask.cuda()
-
+        # input_ids = input_ids.cuda()
+        # media = {k: [v.cuda() if v is not None for v in media[k]] for k in media}
+        # if attention_mask is not None:
+        #     attention_mask = attention_mask.cuda()
         inputs_embeds, _, attention_mask = self._embed(input_ids, media, media_config, None, attention_mask)
         output_ids = self.llm.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
-
+
         if return_output_ids_only:
             return_value = output_ids
         else:
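
The new `_no_split_modules` attribute tells accelerate which blocks must stay on a single device when a device map is inferred, and the commented-out `.cuda()` calls mean `generate` now expects its inputs to already be on the right device. A hedged sketch of loading with an automatic device map (the model path is an assumption):

import torch
from transformers import AutoModel

model_path = "NVILA-Lite-8B-hf-preview"  # assumed local path or Hub id

# With _no_split_modules declared, the inferred device map keeps every
# Qwen2DecoderLayer and SiglipEncoderLayer on one device.
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
print(model.hf_device_map)  # module -> device assignment
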