from typing import Any, Dict, Optional, Type

from transformers.configuration_utils import PretrainedConfig
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
from transformers.models.siglip.configuration_siglip import SiglipVisionConfig


class VILAConfig(PretrainedConfig):
    """Configuration for the VILA vision-language model.

    Composes a Qwen2 text-backbone config with a SigLIP vision-tower config
    and stores the multimodal projector settings that connect them.
    """

    # Class attributes.
    model_type: str = "vila"
    # Map sub-config names to their classes (not instances) so the
    # `transformers` (de)serialization machinery can rebuild nested configs.
    sub_configs: Dict[str, Type[PretrainedConfig]] = {
        "text_config": Qwen2Config,
        "vision_config": SiglipVisionConfig,
    }
    _auto_class: Optional[str] = "AutoConfig"

    # Configurations for sub-modules (always assigned in __init__, so no
    # shared class-level default instances are needed).
    text_config: Qwen2Config
    vision_config: SiglipVisionConfig

    # Model configuration.
    hidden_size: int
    image_token_id: int
    mm_hidden_size: int
    mm_projector_type: str
    mm_vision_select_feature: str
    mm_vision_select_layer: int
    video_token_id: int

    def __init__(
        self,
        text_config: Optional[Dict[str, Any]] = None,
        vision_config: Optional[Dict[str, Any]] = None,
        *,
        hidden_size: int = 1536,
        image_token_id: int = 151649,
        mm_hidden_size: int = 1152,
        mm_projector_type: str = "mlp_downsample_3x3_fix",
        mm_vision_select_feature: str = "cls_patch",
        mm_vision_select_layer: int = -2,
        video_token_id: int = 151650,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Sub-configs arrive as plain dicts when loaded from a serialized
        # config.json; fall back to each backbone's defaults when absent.
        self.text_config = Qwen2Config(**text_config) if text_config else Qwen2Config()
        self.vision_config = SiglipVisionConfig(**vision_config) if vision_config else SiglipVisionConfig()
        self.hidden_size = hidden_size
        self.image_token_id = image_token_id
        self.mm_hidden_size = mm_hidden_size
        self.mm_projector_type = mm_projector_type
        self.mm_vision_select_feature = mm_vision_select_feature
        self.mm_vision_select_layer = mm_vision_select_layer
        self.video_token_id = video_token_id
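

# A minimal usage sketch (illustrative only, not part of the released module):
# it assumes a recent `transformers` release and shows that sub-configs can be
# passed as plain dicts, exactly as they appear in a serialized config.json.
# The override values below are arbitrary examples, not VILA's shipped settings.
if __name__ == "__main__":
    config = VILAConfig(
        text_config={"num_hidden_layers": 28},
        vision_config={"image_size": 448},
        mm_vision_select_layer=-2,
    )
    print(config.model_type)  # "vila"
    print(config.text_config.num_hidden_layers)  # 28, from the override above
    print(config.vision_config.image_size)  # 448, from the override above
    print(config.to_json_string())  # nested sub-configs serialize with the rest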