{ "hidden_size": 768, "num_heads": 12, "expansion": 2.5, "vocab_size": 50257, "max_text_len": 77, "frames": 16, "height": 64, "width": 64, "max_steps": 4, "patch_size": [ 2, 8, 8 ], "text_encoder_layers": 8, "hrm_blocks": 3, "video_seq_len": 512 }