Zebra-Llama-1B-4MLA-12Mamba-SFT / hybrid_config.json
{
  "hidden_size": 2048,
  "intermediate_size": 8192,
  "hidden_act": "silu",
  "n_layer": 16,
  "mla_layers": [
    0,
    5,
    10,
    14
  ],
  "rms_norm_eps": 1e-05,
  "num_attention_heads": 32,
  "num_key_value_heads": 8,
  "kv_lora_rank": 128,
  "q_lora_rank": 1344,
  "use_lora_layer_norm": false,
  "use_full_kv_head": false,
  "qk_rope_head_dim": 32,
  "v_head_dim": 64,
  "qk_nope_head_dim": 32,
  "qkv_rank_divisor": 8,
  "max_position_embeddings": 131072,
  "rope_theta": 500000.0,
  "rope_scaling": {
    "factor": 32.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "yarn"
  },
  "attention_bias": false,
  "attention_dropout": 0.0,
  "d_model": 2048,
  "ssm_cfg": {
    "expand": 1,
    "ngroups": 32,
    "d_state": 64,
    "repeat_kv_before_conv": false
  },
  "d_inner": 2048,
  "d_xb": 512
}
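
The config mixes Multi-head Latent Attention (MLA) settings (kv_lora_rank, q_lora_rank, qk_rope_head_dim, qk_nope_head_dim) with a Mamba SSM block config (ssm_cfg, d_inner, d_xb). Below is a minimal sketch of how such a file might be read, assuming it is saved locally as hybrid_config.json and that every layer not listed in mla_layers is a Mamba block (the 4 MLA + 12 Mamba split implied by the model name). The variable names are illustrative, not the repository's actual loading code.

import json

# Load the hybrid config and derive a per-layer plan:
# layers listed in "mla_layers" use MLA attention, the rest are Mamba blocks
# (assumption based on the model name, not confirmed by this file alone).
with open("hybrid_config.json") as f:
    cfg = json.load(f)

mla_layers = set(cfg["mla_layers"])
layer_plan = ["mla" if i in mla_layers else "mamba" for i in range(cfg["n_layer"])]
print(layer_plan)  # MLA at indices 0, 5, 10, 14; Mamba elsewhere (16 layers total)

# Rough per-token cache width of one MLA layer under MLA-style compressed caching
# (latent KV of rank kv_lora_rank plus the shared RoPE key); illustrative only.
kv_cache_dim = cfg["kv_lora_rank"] + cfg["qk_rope_head_dim"]
print(kv_cache_dim)  # 128 + 32 = 160

The per-layer plan is the main thing downstream code needs from this file: with only 4 of 16 layers carrying an attention KV cache (and that cache compressed via kv_lora_rank), the hybrid keeps long-context memory use well below a pure-attention Llama layout.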