from typing import Optional

from transformers.configuration_utils import PretrainedConfig


class MoonViTConfig(PretrainedConfig):
    """Configuration for the MoonViT vision encoder.

    Args:
        patch_size: Side length (in pixels) of each square image patch.
        init_pos_emb_height: Height of the grid used to initialize the 2D
            positional embeddings.
        init_pos_emb_width: Width of that grid.
        num_attention_heads: Number of attention heads per layer.
        num_hidden_layers: Number of transformer layers.
        hidden_size: Dimensionality of the encoder hidden states.
        text_hidden_size: Hidden size of the paired text model.
        intermediate_size: Dimensionality of the MLP intermediate layer.
        merge_kernel_size: Kernel size used to merge adjacent patch features.
        rope_theta: Base frequency of the rotary position embeddings.
        max_position_embeddings: Maximum number of supported positions.
        rope_scaling: Rotary position embedding scaling configuration.
    """

    model_type = "moonvit"

    def __init__(
        self,
        patch_size: int = 14,
        init_pos_emb_height: int = 64,
        init_pos_emb_width: int = 64,
        num_attention_heads: int = 16,
        num_hidden_layers: int = 27,
        hidden_size: int = 1152,
        text_hidden_size: int = 2048,
        intermediate_size: int = 4304,
        merge_kernel_size: tuple[int, int] = (2, 2),
        rope_theta: float = 1000000.0,
        max_position_embeddings: int = 128000,
        rope_scaling: Optional[dict] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        # Grid used to initialize the 2D positional embeddings.
        self.init_pos_emb_height = init_pos_emb_height
        self.init_pos_emb_width = init_pos_emb_width
        # Transformer dimensions.
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_size = hidden_size
        self.text_hidden_size = text_hidden_size
        self.intermediate_size = intermediate_size
        # Kernel used when merging adjacent patch features.
        self.merge_kernel_size = merge_kernel_size
        # Rotary position embedding (RoPE) settings. The default scaling dict is
        # built here rather than in the signature to avoid a shared mutable
        # default argument.
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings
        if rope_scaling is None:
            rope_scaling = {"type": "default", "mrope_section": [12, 12, 12], "rope_type": "default"}
        self.rope_scaling = rope_scaling
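

# Minimal usage sketch (illustrative only): constructs the config with its
# defaults and with an overridden positional-embedding grid. It assumes only
# the class defined above; the override values below are hypothetical, not
# taken from any released checkpoint.
if __name__ == "__main__":
    default_cfg = MoonViTConfig()
    print(default_cfg.hidden_size, default_cfg.merge_kernel_size)

    # Hypothetical override: a larger initial positional-embedding grid.
    custom_cfg = MoonViTConfig(init_pos_emb_height=96, init_pos_emb_width=96)
    print(custom_cfg.init_pos_emb_height, custom_cfg.init_pos_emb_width)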