{
  "architectures": [
    "Step3vForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "stepfun-ai/step3--configuration_step3.Step3VLConfig",
    "AutoModelForCausalLM": "stepfun-ai/step3--modeling_step3.Step3vForConditionalGeneration"
  },
  "bos_token_id": 0,
  "eos_token_id": 128805,
  "hidden_size": 32,
  "im_end_token": "",
  "im_patch_token": "",
  "im_start_token": "",
  "image_token_id": 128001,
  "image_token_len": 169,
  "model_type": "step3_vl",
  "patch_token_len": 81,
  "projector_bias": false,
  "text_config": {
    "architectures": [
      "Step3TextForCausalLM"
    ],
    "head_dim": 256,
    "hidden_size": 32,
    "intermediate_size": 64,
    "max_position_embedding": 65536,
    "max_seq_len": 65536,
    "model_type": "step3_text",
    "moe_intermediate_size": 64,
    "moe_layers_enum": "1",
    "moe_num_experts": 8,
    "moe_top_k": 3,
    "norm_expert_weight": false,
    "num_attention_groups": 1,
    "num_attention_heads": 2,
    "num_hidden_layers": 2,
    "rms_norm_eps": 1e-05,
    "rope_scaling": null,
    "rope_theta": 500000,
    "share_expert_dim": 64,
    "share_q_dim": 512,
    "torch_dtype": "bfloat16",
    "vocab_size": 128815
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.54.1",
  "understand_projector_stride": 2,
  "vision_config": {
    "hidden_act": "quick_gelu",
    "hidden_size": 64,
    "image_size": 728,
    "intermediate_size": 128,
    "layer_norm_eps": 1e-05,
    "model_type": "step3_vision_encoder",
    "num_attention_heads": 2,
    "num_channels": 3,
    "num_hidden_layers": 2,
    "output_hidden_size": 64,
    "patch_size": 14
  }
}