# LFM2-8B-A1B-AWQ-4bit / recipe.yaml
# cpatonn's picture
# Upload folder using huggingface_hub
# 0c11e38 verified
default_stage:
default_modifiers:
AWQModifier:
# Quantization scheme groups. group_0 targets Linear and sets 4-bit symmetric
# integer weights using the group strategy (group_size 32) with an mse
# observer; input/output activation settings and format are null.
# NOTE(review): nesting was reconstructed after indentation was lost — confirm
# that input_activations, output_activations and format belong to group_0
# rather than to weights.
config_groups:
  group_0:
    targets: [Linear]
    weights:
      num_bits: 4
      type: int
      symmetric: true
      group_size: 32
      strategy: group
      block_structure: null
      dynamic: false
      actorder: null
      observer: mse
      observer_kwargs: {}
    input_activations: null
    output_activations: null
    format: null
# Module targets: a one-element list containing Linear.
targets:
- Linear
# Ignore list, one entry per line. Entries prefixed with re: are presumably
# treated as regular expressions by the consumer — confirm.
ignore:
- model.embed_tokens
- model.embedding_norm
- 're:model[.]layers[.]0[.].*'
- 're:model[.]layers[.]1[.].*'
- 're:.*feed_forward[.]gate$'
- 're:.*ffn_norm$'
- 're:.*operator_norm$'
- 're:.*self_attn.*'
- 're:.*conv.*'
- lm_head
mappings:
# Layer 0: operator_norm paired with conv.in_proj.
- smooth_layer: model.layers.0.operator_norm
  balance_layers: [model.layers.0.conv.in_proj]
# Layer 0: ffn_norm paired with feed_forward.w1 and feed_forward.w3.
- smooth_layer: model.layers.0.ffn_norm
  balance_layers: [model.layers.0.feed_forward.w1, model.layers.0.feed_forward.w3]
# Layer 1: operator_norm paired with conv.in_proj.
- smooth_layer: model.layers.1.operator_norm
  balance_layers: [model.layers.1.conv.in_proj]
# Layer 1: ffn_norm paired with feed_forward.w1 and feed_forward.w3.
- smooth_layer: model.layers.1.ffn_norm
  balance_layers: [model.layers.1.feed_forward.w1, model.layers.1.feed_forward.w3]
# Layer 2: operator_norm paired with the self_attn q_proj, k_proj and v_proj.
- smooth_layer: model.layers.2.operator_norm
  balance_layers: [model.layers.2.self_attn.q_proj, model.layers.2.self_attn.k_proj,
    model.layers.2.self_attn.v_proj]
# Layer 2: ffn_norm paired with feed_forward.gate and the w1/w3 modules of
# experts 0-31.
- smooth_layer: model.layers.2.ffn_norm
  balance_layers: [model.layers.2.feed_forward.gate, model.layers.2.feed_forward.experts.0.w1,
    model.layers.2.feed_forward.experts.0.w3, model.layers.2.feed_forward.experts.1.w1,
    model.layers.2.feed_forward.experts.1.w3, model.layers.2.feed_forward.experts.2.w1,
    model.layers.2.feed_forward.experts.2.w3, model.layers.2.feed_forward.experts.3.w1,
    model.layers.2.feed_forward.experts.3.w3, model.layers.2.feed_forward.experts.4.w1,
    model.layers.2.feed_forward.experts.4.w3, model.layers.2.feed_forward.experts.5.w1,
    model.layers.2.feed_forward.experts.5.w3, model.layers.2.feed_forward.experts.6.w1,
    model.layers.2.feed_forward.experts.6.w3, model.layers.2.feed_forward.experts.7.w1,
    model.layers.2.feed_forward.experts.7.w3, model.layers.2.feed_forward.experts.8.w1,
    model.layers.2.feed_forward.experts.8.w3, model.layers.2.feed_forward.experts.9.w1,
    model.layers.2.feed_forward.experts.9.w3, model.layers.2.feed_forward.experts.10.w1,
    model.layers.2.feed_forward.experts.10.w3, model.layers.2.feed_forward.experts.11.w1,
    model.layers.2.feed_forward.experts.11.w3, model.layers.2.feed_forward.experts.12.w1,
    model.layers.2.feed_forward.experts.12.w3, model.layers.2.feed_forward.experts.13.w1,
    model.layers.2.feed_forward.experts.13.w3, model.layers.2.feed_forward.experts.14.w1,
    model.layers.2.feed_forward.experts.14.w3, model.layers.2.feed_forward.experts.15.w1,
    model.layers.2.feed_forward.experts.15.w3, model.layers.2.feed_forward.experts.16.w1,
    model.layers.2.feed_forward.experts.16.w3, model.layers.2.feed_forward.experts.17.w1,
    model.layers.2.feed_forward.experts.17.w3, model.layers.2.feed_forward.experts.18.w1,
    model.layers.2.feed_forward.experts.18.w3, model.layers.2.feed_forward.experts.19.w1,
    model.layers.2.feed_forward.experts.19.w3, model.layers.2.feed_forward.experts.20.w1,
    model.layers.2.feed_forward.experts.20.w3, model.layers.2.feed_forward.experts.21.w1,
    model.layers.2.feed_forward.experts.21.w3, model.layers.2.feed_forward.experts.22.w1,
    model.layers.2.feed_forward.experts.22.w3, model.layers.2.feed_forward.experts.23.w1,
    model.layers.2.feed_forward.experts.23.w3, model.layers.2.feed_forward.experts.24.w1,
    model.layers.2.feed_forward.experts.24.w3, model.layers.2.feed_forward.experts.25.w1,
    model.layers.2.feed_forward.experts.25.w3, model.layers.2.feed_forward.experts.26.w1,
    model.layers.2.feed_forward.experts.26.w3, model.layers.2.feed_forward.experts.27.w1,
    model.layers.2.feed_forward.experts.27.w3, model.layers.2.feed_forward.experts.28.w1,
    model.layers.2.feed_forward.experts.28.w3, model.layers.2.feed_forward.experts.29.w1,
    model.layers.2.feed_forward.experts.29.w3, model.layers.2.feed_forward.experts.30.w1,
    model.layers.2.feed_forward.experts.30.w3, model.layers.2.feed_forward.experts.31.w1,
    model.layers.2.feed_forward.experts.31.w3]
# Layer 3: operator_norm paired with conv.in_proj.
- smooth_layer: model.layers.3.operator_norm
  balance_layers: [model.layers.3.conv.in_proj]
# Layer 3: ffn_norm paired with feed_forward.gate and the w1/w3 modules of
# experts 0-31.
- smooth_layer: model.layers.3.ffn_norm
  balance_layers: [model.layers.3.feed_forward.gate, model.layers.3.feed_forward.experts.0.w1,
    model.layers.3.feed_forward.experts.0.w3, model.layers.3.feed_forward.experts.1.w1,
    model.layers.3.feed_forward.experts.1.w3, model.layers.3.feed_forward.experts.2.w1,
    model.layers.3.feed_forward.experts.2.w3, model.layers.3.feed_forward.experts.3.w1,
    model.layers.3.feed_forward.experts.3.w3, model.layers.3.feed_forward.experts.4.w1,
    model.layers.3.feed_forward.experts.4.w3, model.layers.3.feed_forward.experts.5.w1,
    model.layers.3.feed_forward.experts.5.w3, model.layers.3.feed_forward.experts.6.w1,
    model.layers.3.feed_forward.experts.6.w3, model.layers.3.feed_forward.experts.7.w1,
    model.layers.3.feed_forward.experts.7.w3, model.layers.3.feed_forward.experts.8.w1,
    model.layers.3.feed_forward.experts.8.w3, model.layers.3.feed_forward.experts.9.w1,
    model.layers.3.feed_forward.experts.9.w3, model.layers.3.feed_forward.experts.10.w1,
    model.layers.3.feed_forward.experts.10.w3, model.layers.3.feed_forward.experts.11.w1,
    model.layers.3.feed_forward.experts.11.w3, model.layers.3.feed_forward.experts.12.w1,
    model.layers.3.feed_forward.experts.12.w3, model.layers.3.feed_forward.experts.13.w1,
    model.layers.3.feed_forward.experts.13.w3, model.layers.3.feed_forward.experts.14.w1,
    model.layers.3.feed_forward.experts.14.w3, model.layers.3.feed_forward.experts.15.w1,
    model.layers.3.feed_forward.experts.15.w3, model.layers.3.feed_forward.experts.16.w1,
    model.layers.3.feed_forward.experts.16.w3, model.layers.3.feed_forward.experts.17.w1,
    model.layers.3.feed_forward.experts.17.w3, model.layers.3.feed_forward.experts.18.w1,
    model.layers.3.feed_forward.experts.18.w3, model.layers.3.feed_forward.experts.19.w1,
    model.layers.3.feed_forward.experts.19.w3, model.layers.3.feed_forward.experts.20.w1,
    model.layers.3.feed_forward.experts.20.w3, model.layers.3.feed_forward.experts.21.w1,
    model.layers.3.feed_forward.experts.21.w3, model.layers.3.feed_forward.experts.22.w1,
    model.layers.3.feed_forward.experts.22.w3, model.layers.3.feed_forward.experts.23.w1,
    model.layers.3.feed_forward.experts.23.w3, model.layers.3.feed_forward.experts.24.w1,
    model.layers.3.feed_forward.experts.24.w3, model.layers.3.feed_forward.experts.25.w1,
    model.layers.3.feed_forward.experts.25.w3, model.layers.3.feed_forward.experts.26.w1,
    model.layers.3.feed_forward.experts.26.w3, model.layers.3.feed_forward.experts.27.w1,
    model.layers.3.feed_forward.experts.27.w3, model.layers.3.feed_forward.experts.28.w1,
    model.layers.3.feed_forward.experts.28.w3, model.layers.3.feed_forward.experts.29.w1,
    model.layers.3.feed_forward.experts.29.w3, model.layers.3.feed_forward.experts.30.w1,
    model.layers.3.feed_forward.experts.30.w3, model.layers.3.feed_forward.experts.31.w1,
    model.layers.3.feed_forward.experts.31.w3]
# Layer 4: operator_norm paired with conv.in_proj.
- smooth_layer: model.layers.4.operator_norm
  balance_layers: [model.layers.4.conv.in_proj]
# Layer 4: ffn_norm paired with feed_forward.gate and the w1/w3 modules of
# experts 0-31.
- smooth_layer: model.layers.4.ffn_norm
  balance_layers: [model.layers.4.feed_forward.gate, model.layers.4.feed_forward.experts.0.w1,
    model.layers.4.feed_forward.experts.0.w3, model.layers.4.feed_forward.experts.1.w1,
    model.layers.4.feed_forward.experts.1.w3, model.layers.4.feed_forward.experts.2.w1,
    model.layers.4.feed_forward.experts.2.w3, model.layers.4.feed_forward.experts.3.w1,
    model.layers.4.feed_forward.experts.3.w3, model.layers.4.feed_forward.experts.4.w1,
    model.layers.4.feed_forward.experts.4.w3, model.layers.4.feed_forward.experts.5.w1,
    model.layers.4.feed_forward.experts.5.w3, model.layers.4.feed_forward.experts.6.w1,
    model.layers.4.feed_forward.experts.6.w3, model.layers.4.feed_forward.experts.7.w1,
    model.layers.4.feed_forward.experts.7.w3, model.layers.4.feed_forward.experts.8.w1,
    model.layers.4.feed_forward.experts.8.w3, model.layers.4.feed_forward.experts.9.w1,
    model.layers.4.feed_forward.experts.9.w3, model.layers.4.feed_forward.experts.10.w1,
    model.layers.4.feed_forward.experts.10.w3, model.layers.4.feed_forward.experts.11.w1,
    model.layers.4.feed_forward.experts.11.w3, model.layers.4.feed_forward.experts.12.w1,
    model.layers.4.feed_forward.experts.12.w3, model.layers.4.feed_forward.experts.13.w1,
    model.layers.4.feed_forward.experts.13.w3, model.layers.4.feed_forward.experts.14.w1,
    model.layers.4.feed_forward.experts.14.w3, model.layers.4.feed_forward.experts.15.w1,
    model.layers.4.feed_forward.experts.15.w3, model.layers.4.feed_forward.experts.16.w1,
    model.layers.4.feed_forward.experts.16.w3, model.layers.4.feed_forward.experts.17.w1,
    model.layers.4.feed_forward.experts.17.w3, model.layers.4.feed_forward.experts.18.w1,
    model.layers.4.feed_forward.experts.18.w3, model.layers.4.feed_forward.experts.19.w1,
    model.layers.4.feed_forward.experts.19.w3, model.layers.4.feed_forward.experts.20.w1,
    model.layers.4.feed_forward.experts.20.w3, model.layers.4.feed_forward.experts.21.w1,
    model.layers.4.feed_forward.experts.21.w3, model.layers.4.feed_forward.experts.22.w1,
    model.layers.4.feed_forward.experts.22.w3, model.layers.4.feed_forward.experts.23.w1,
    model.layers.4.feed_forward.experts.23.w3, model.layers.4.feed_forward.experts.24.w1,
    model.layers.4.feed_forward.experts.24.w3, model.layers.4.feed_forward.experts.25.w1,
    model.layers.4.feed_forward.experts.25.w3, model.layers.4.feed_forward.experts.26.w1,
    model.layers.4.feed_forward.experts.26.w3, model.layers.4.feed_forward.experts.27.w1,
    model.layers.4.feed_forward.experts.27.w3, model.layers.4.feed_forward.experts.28.w1,
    model.layers.4.feed_forward.experts.28.w3, model.layers.4.feed_forward.experts.29.w1,
    model.layers.4.feed_forward.experts.29.w3, model.layers.4.feed_forward.experts.30.w1,
    model.layers.4.feed_forward.experts.30.w3, model.layers.4.feed_forward.experts.31.w1,
    model.layers.4.feed_forward.experts.31.w3]
# Layer 5: operator_norm paired with conv.in_proj.
- smooth_layer: model.layers.5.operator_norm
  balance_layers: [model.layers.5.conv.in_proj]
# Layer 5: ffn_norm paired with feed_forward.gate and the w1/w3 modules of
# experts 0-31.
- smooth_layer: model.layers.5.ffn_norm
  balance_layers: [model.layers.5.feed_forward.gate, model.layers.5.feed_forward.experts.0.w1,
    model.layers.5.feed_forward.experts.0.w3, model.layers.5.feed_forward.experts.1.w1,
    model.layers.5.feed_forward.experts.1.w3, model.layers.5.feed_forward.experts.2.w1,
    model.layers.5.feed_forward.experts.2.w3, model.layers.5.feed_forward.experts.3.w1,
    model.layers.5.feed_forward.experts.3.w3, model.layers.5.feed_forward.experts.4.w1,
    model.layers.5.feed_forward.experts.4.w3, model.layers.5.feed_forward.experts.5.w1,
    model.layers.5.feed_forward.experts.5.w3, model.layers.5.feed_forward.experts.6.w1,
    model.layers.5.feed_forward.experts.6.w3, model.layers.5.feed_forward.experts.7.w1,
    model.layers.5.feed_forward.experts.7.w3, model.layers.5.feed_forward.experts.8.w1,
    model.layers.5.feed_forward.experts.8.w3, model.layers.5.feed_forward.experts.9.w1,
    model.layers.5.feed_forward.experts.9.w3, model.layers.5.feed_forward.experts.10.w1,
    model.layers.5.feed_forward.experts.10.w3, model.layers.5.feed_forward.experts.11.w1,
    model.layers.5.feed_forward.experts.11.w3, model.layers.5.feed_forward.experts.12.w1,
    model.layers.5.feed_forward.experts.12.w3, model.layers.5.feed_forward.experts.13.w1,
    model.layers.5.feed_forward.experts.13.w3, model.layers.5.feed_forward.experts.14.w1,
    model.layers.5.feed_forward.experts.14.w3, model.layers.5.feed_forward.experts.15.w1,
    model.layers.5.feed_forward.experts.15.w3, model.layers.5.feed_forward.experts.16.w1,
    model.layers.5.feed_forward.experts.16.w3, model.layers.5.feed_forward.experts.17.w1,
    model.layers.5.feed_forward.experts.17.w3, model.layers.5.feed_forward.experts.18.w1,
    model.layers.5.feed_forward.experts.18.w3, model.layers.5.feed_forward.experts.19.w1,
    model.layers.5.feed_forward.experts.19.w3, model.layers.5.feed_forward.experts.20.w1,
    model.layers.5.feed_forward.experts.20.w3, model.layers.5.feed_forward.experts.21.w1,
    model.layers.5.feed_forward.experts.21.w3, model.layers.5.feed_forward.experts.22.w1,
    model.layers.5.feed_forward.experts.22.w3, model.layers.5.feed_forward.experts.23.w1,
    model.layers.5.feed_forward.experts.23.w3, model.layers.5.feed_forward.experts.24.w1,
    model.layers.5.feed_forward.experts.24.w3, model.layers.5.feed_forward.experts.25.w1,
    model.layers.5.feed_forward.experts.25.w3, model.layers.5.feed_forward.experts.26.w1,
    model.layers.5.feed_forward.experts.26.w3, model.layers.5.feed_forward.experts.27.w1,
    model.layers.5.feed_forward.experts.27.w3, model.layers.5.feed_forward.experts.28.w1,
    model.layers.5.feed_forward.experts.28.w3, model.layers.5.feed_forward.experts.29.w1,
    model.layers.5.feed_forward.experts.29.w3, model.layers.5.feed_forward.experts.30.w1,
    model.layers.5.feed_forward.experts.30.w3, model.layers.5.feed_forward.experts.31.w1,
    model.layers.5.feed_forward.experts.31.w3]
# Layer 6: operator_norm paired with the self_attn q_proj, k_proj and v_proj.
- smooth_layer: model.layers.6.operator_norm
  balance_layers: [model.layers.6.self_attn.q_proj, model.layers.6.self_attn.k_proj,
    model.layers.6.self_attn.v_proj]
# Layer 6: ffn_norm paired with feed_forward.gate and the w1/w3 modules of
# experts 0-31.
- smooth_layer: model.layers.6.ffn_norm
  balance_layers: [model.layers.6.feed_forward.gate, model.layers.6.feed_forward.experts.0.w1,
    model.layers.6.feed_forward.experts.0.w3, model.layers.6.feed_forward.experts.1.w1,
    model.layers.6.feed_forward.experts.1.w3, model.layers.6.feed_forward.experts.2.w1,
    model.layers.6.feed_forward.experts.2.w3, model.layers.6.feed_forward.experts.3.w1,
    model.layers.6.feed_forward.experts.3.w3, model.layers.6.feed_forward.experts.4.w1,
    model.layers.6.feed_forward.experts.4.w3, model.layers.6.feed_forward.experts.5.w1,
    model.layers.6.feed_forward.experts.5.w3, model.layers.6.feed_forward.experts.6.w1,
    model.layers.6.feed_forward.experts.6.w3, model.layers.6.feed_forward.experts.7.w1,
    model.layers.6.feed_forward.experts.7.w3, model.layers.6.feed_forward.experts.8.w1,
    model.layers.6.feed_forward.experts.8.w3, model.layers.6.feed_forward.experts.9.w1,
    model.layers.6.feed_forward.experts.9.w3, model.layers.6.feed_forward.experts.10.w1,
    model.layers.6.feed_forward.experts.10.w3, model.layers.6.feed_forward.experts.11.w1,
    model.layers.6.feed_forward.experts.11.w3, model.layers.6.feed_forward.experts.12.w1,
    model.layers.6.feed_forward.experts.12.w3, model.layers.6.feed_forward.experts.13.w1,
    model.layers.6.feed_forward.experts.13.w3, model.layers.6.feed_forward.experts.14.w1,
    model.layers.6.feed_forward.experts.14.w3, model.layers.6.feed_forward.experts.15.w1,
    model.layers.6.feed_forward.experts.15.w3, model.layers.6.feed_forward.experts.16.w1,
    model.layers.6.feed_forward.experts.16.w3, model.layers.6.feed_forward.experts.17.w1,
    model.layers.6.feed_forward.experts.17.w3, model.layers.6.feed_forward.experts.18.w1,
    model.layers.6.feed_forward.experts.18.w3, model.layers.6.feed_forward.experts.19.w1,
    model.layers.6.feed_forward.experts.19.w3, model.layers.6.feed_forward.experts.20.w1,
    model.layers.6.feed_forward.experts.20.w3, model.layers.6.feed_forward.experts.21.w1,
    model.layers.6.feed_forward.experts.21.w3, model.layers.6.feed_forward.experts.22.w1,
    model.layers.6.feed_forward.experts.22.w3, model.layers.6.feed_forward.experts.23.w1,
    model.layers.6.feed_forward.experts.23.w3, model.layers.6.feed_forward.experts.24.w1,
    model.layers.6.feed_forward.experts.24.w3, model.layers.6.feed_forward.experts.25.w1,
    model.layers.6.feed_forward.experts.25.w3, model.layers.6.feed_forward.experts.26.w1,
    model.layers.6.feed_forward.experts.26.w3, model.layers.6.feed_forward.experts.27.w1,
    model.layers.6.feed_forward.experts.27.w3, model.layers.6.feed_forward.experts.28.w1,
    model.layers.6.feed_forward.experts.28.w3, model.layers.6.feed_forward.experts.29.w1,
    model.layers.6.feed_forward.experts.29.w3, model.layers.6.feed_forward.experts.30.w1,
    model.layers.6.feed_forward.experts.30.w3, model.layers.6.feed_forward.experts.31.w1,
    model.layers.6.feed_forward.experts.31.w3]
# Layer 7: operator_norm paired with conv.in_proj.
- smooth_layer: model.layers.7.operator_norm
  balance_layers: [model.layers.7.conv.in_proj]
# Layer 7: ffn_norm paired with feed_forward.gate and the w1/w3 modules of
# experts 0-31.
- smooth_layer: model.layers.7.ffn_norm
  balance_layers: [model.layers.7.feed_forward.gate, model.layers.7.feed_forward.experts.0.w1,
    model.layers.7.feed_forward.experts.0.w3, model.layers.7.feed_forward.experts.1.w1,
    model.layers.7.feed_forward.experts.1.w3, model.layers.7.feed_forward.experts.2.w1,
    model.layers.7.feed_forward.experts.2.w3, model.layers.7.feed_forward.experts.3.w1,
    model.layers.7.feed_forward.experts.3.w3, model.layers.7.feed_forward.experts.4.w1,
    model.layers.7.feed_forward.experts.4.w3, model.layers.7.feed_forward.experts.5.w1,
    model.layers.7.feed_forward.experts.5.w3, model.layers.7.feed_forward.experts.6.w1,
    model.layers.7.feed_forward.experts.6.w3, model.layers.7.feed_forward.experts.7.w1,
    model.layers.7.feed_forward.experts.7.w3, model.layers.7.feed_forward.experts.8.w1,
    model.layers.7.feed_forward.experts.8.w3, model.layers.7.feed_forward.experts.9.w1,
    model.layers.7.feed_forward.experts.9.w3, model.layers.7.feed_forward.experts.10.w1,
    model.layers.7.feed_forward.experts.10.w3, model.layers.7.feed_forward.experts.11.w1,
    model.layers.7.feed_forward.experts.11.w3, model.layers.7.feed_forward.experts.12.w1,
    model.layers.7.feed_forward.experts.12.w3, model.layers.7.feed_forward.experts.13.w1,
    model.layers.7.feed_forward.experts.13.w3, model.layers.7.feed_forward.experts.14.w1,
    model.layers.7.feed_forward.experts.14.w3, model.layers.7.feed_forward.experts.15.w1,
    model.layers.7.feed_forward.experts.15.w3, model.layers.7.feed_forward.experts.16.w1,
    model.layers.7.feed_forward.experts.16.w3, model.layers.7.feed_forward.experts.17.w1,
    model.layers.7.feed_forward.experts.17.w3, model.layers.7.feed_forward.experts.18.w1,
    model.layers.7.feed_forward.experts.18.w3, model.layers.7.feed_forward.experts.19.w1,
    model.layers.7.feed_forward.experts.19.w3, model.layers.7.feed_forward.experts.20.w1,
    model.layers.7.feed_forward.experts.20.w3, model.layers.7.feed_forward.experts.21.w1,
    model.layers.7.feed_forward.experts.21.w3, model.layers.7.feed_forward.experts.22.w1,
    model.layers.7.feed_forward.experts.22.w3, model.layers.7.feed_forward.experts.23.w1,
    model.layers.7.feed_forward.experts.23.w3, model.layers.7.feed_forward.experts.24.w1,
    model.layers.7.feed_forward.experts.24.w3, model.layers.7.feed_forward.experts.25.w1,
    model.layers.7.feed_forward.experts.25.w3, model.layers.7.feed_forward.experts.26.w1,
    model.layers.7.feed_forward.experts.26.w3, model.layers.7.feed_forward.experts.27.w1,
    model.layers.7.feed_forward.experts.27.w3, model.layers.7.feed_forward.experts.28.w1,
    model.layers.7.feed_forward.experts.28.w3, model.layers.7.feed_forward.experts.29.w1,
    model.layers.7.feed_forward.experts.29.w3, model.layers.7.feed_forward.experts.30.w1,
    model.layers.7.feed_forward.experts.30.w3, model.layers.7.feed_forward.experts.31.w1,
    model.layers.7.feed_forward.experts.31.w3]
# Layer 8: operator_norm paired with conv.in_proj.
- smooth_layer: model.layers.8.operator_norm
  balance_layers: [model.layers.8.conv.in_proj]
# Layer 8: ffn_norm paired with feed_forward.gate and the w1/w3 modules of
# experts 0-31.
- smooth_layer: model.layers.8.ffn_norm
  balance_layers: [model.layers.8.feed_forward.gate, model.layers.8.feed_forward.experts.0.w1,
    model.layers.8.feed_forward.experts.0.w3, model.layers.8.feed_forward.experts.1.w1,
    model.layers.8.feed_forward.experts.1.w3, model.layers.8.feed_forward.experts.2.w1,
    model.layers.8.feed_forward.experts.2.w3, model.layers.8.feed_forward.experts.3.w1,
    model.layers.8.feed_forward.experts.3.w3, model.layers.8.feed_forward.experts.4.w1,
    model.layers.8.feed_forward.experts.4.w3, model.layers.8.feed_forward.experts.5.w1,
    model.layers.8.feed_forward.experts.5.w3, model.layers.8.feed_forward.experts.6.w1,
    model.layers.8.feed_forward.experts.6.w3, model.layers.8.feed_forward.experts.7.w1,
    model.layers.8.feed_forward.experts.7.w3, model.layers.8.feed_forward.experts.8.w1,
    model.layers.8.feed_forward.experts.8.w3, model.layers.8.feed_forward.experts.9.w1,
    model.layers.8.feed_forward.experts.9.w3, model.layers.8.feed_forward.experts.10.w1,
    model.layers.8.feed_forward.experts.10.w3, model.layers.8.feed_forward.experts.11.w1,
    model.layers.8.feed_forward.experts.11.w3, model.layers.8.feed_forward.experts.12.w1,
    model.layers.8.feed_forward.experts.12.w3, model.layers.8.feed_forward.experts.13.w1,
    model.layers.8.feed_forward.experts.13.w3, model.layers.8.feed_forward.experts.14.w1,
    model.layers.8.feed_forward.experts.14.w3, model.layers.8.feed_forward.experts.15.w1,
    model.layers.8.feed_forward.experts.15.w3, model.layers.8.feed_forward.experts.16.w1,
    model.layers.8.feed_forward.experts.16.w3, model.layers.8.feed_forward.experts.17.w1,
    model.layers.8.feed_forward.experts.17.w3, model.layers.8.feed_forward.experts.18.w1,
    model.layers.8.feed_forward.experts.18.w3, model.layers.8.feed_forward.experts.19.w1,
    model.layers.8.feed_forward.experts.19.w3, model.layers.8.feed_forward.experts.20.w1,
    model.layers.8.feed_forward.experts.20.w3, model.layers.8.feed_forward.experts.21.w1,
    model.layers.8.feed_forward.experts.21.w3, model.layers.8.feed_forward.experts.22.w1,
    model.layers.8.feed_forward.experts.22.w3, model.layers.8.feed_forward.experts.23.w1,
    model.layers.8.feed_forward.experts.23.w3, model.layers.8.feed_forward.experts.24.w1,
    model.layers.8.feed_forward.experts.24.w3, model.layers.8.feed_forward.experts.25.w1,
    model.layers.8.feed_forward.experts.25.w3, model.layers.8.feed_forward.experts.26.w1,
    model.layers.8.feed_forward.experts.26.w3, model.layers.8.feed_forward.experts.27.w1,
    model.layers.8.feed_forward.experts.27.w3, model.layers.8.feed_forward.experts.28.w1,
    model.layers.8.feed_forward.experts.28.w3, model.layers.8.feed_forward.experts.29.w1,
    model.layers.8.feed_forward.experts.29.w3, model.layers.8.feed_forward.experts.30.w1,
    model.layers.8.feed_forward.experts.30.w3, model.layers.8.feed_forward.experts.31.w1,
    model.layers.8.feed_forward.experts.31.w3]
# Layer 9: operator_norm paired with conv.in_proj.
- smooth_layer: model.layers.9.operator_norm
  balance_layers: [model.layers.9.conv.in_proj]
# Layer 9: ffn_norm paired with feed_forward.gate and the w1/w3 modules of
# experts 0-31.
- smooth_layer: model.layers.9.ffn_norm
  balance_layers: [model.layers.9.feed_forward.gate, model.layers.9.feed_forward.experts.0.w1,
    model.layers.9.feed_forward.experts.0.w3, model.layers.9.feed_forward.experts.1.w1,
    model.layers.9.feed_forward.experts.1.w3, model.layers.9.feed_forward.experts.2.w1,
    model.layers.9.feed_forward.experts.2.w3, model.layers.9.feed_forward.experts.3.w1,
    model.layers.9.feed_forward.experts.3.w3, model.layers.9.feed_forward.experts.4.w1,
    model.layers.9.feed_forward.experts.4.w3, model.layers.9.feed_forward.experts.5.w1,
    model.layers.9.feed_forward.experts.5.w3, model.layers.9.feed_forward.experts.6.w1,
    model.layers.9.feed_forward.experts.6.w3, model.layers.9.feed_forward.experts.7.w1,
    model.layers.9.feed_forward.experts.7.w3, model.layers.9.feed_forward.experts.8.w1,
    model.layers.9.feed_forward.experts.8.w3, model.layers.9.feed_forward.experts.9.w1,
    model.layers.9.feed_forward.experts.9.w3, model.layers.9.feed_forward.experts.10.w1,
    model.layers.9.feed_forward.experts.10.w3, model.layers.9.feed_forward.experts.11.w1,
    model.layers.9.feed_forward.experts.11.w3, model.layers.9.feed_forward.experts.12.w1,
    model.layers.9.feed_forward.experts.12.w3, model.layers.9.feed_forward.experts.13.w1,
    model.layers.9.feed_forward.experts.13.w3, model.layers.9.feed_forward.experts.14.w1,
    model.layers.9.feed_forward.experts.14.w3, model.layers.9.feed_forward.experts.15.w1,
    model.layers.9.feed_forward.experts.15.w3, model.layers.9.feed_forward.experts.16.w1,
    model.layers.9.feed_forward.experts.16.w3, model.layers.9.feed_forward.experts.17.w1,
    model.layers.9.feed_forward.experts.17.w3, model.layers.9.feed_forward.experts.18.w1,
    model.layers.9.feed_forward.experts.18.w3, model.layers.9.feed_forward.experts.19.w1,
    model.layers.9.feed_forward.experts.19.w3, model.layers.9.feed_forward.experts.20.w1,
    model.layers.9.feed_forward.experts.20.w3, model.layers.9.feed_forward.experts.21.w1,
    model.layers.9.feed_forward.experts.21.w3, model.layers.9.feed_forward.experts.22.w1,
    model.layers.9.feed_forward.experts.22.w3, model.layers.9.feed_forward.experts.23.w1,
    model.layers.9.feed_forward.experts.23.w3, model.layers.9.feed_forward.experts.24.w1,
    model.layers.9.feed_forward.experts.24.w3, model.layers.9.feed_forward.experts.25.w1,
    model.layers.9.feed_forward.experts.25.w3, model.layers.9.feed_forward.experts.26.w1,
    model.layers.9.feed_forward.experts.26.w3, model.layers.9.feed_forward.experts.27.w1,
    model.layers.9.feed_forward.experts.27.w3, model.layers.9.feed_forward.experts.28.w1,
    model.layers.9.feed_forward.experts.28.w3, model.layers.9.feed_forward.experts.29.w1,
    model.layers.9.feed_forward.experts.29.w3, model.layers.9.feed_forward.experts.30.w1,
    model.layers.9.feed_forward.experts.30.w3, model.layers.9.feed_forward.experts.31.w1,
    model.layers.9.feed_forward.experts.31.w3]
# Layer 10: operator_norm paired with the self_attn q_proj, k_proj and v_proj.
- smooth_layer: model.layers.10.operator_norm
  balance_layers: [model.layers.10.self_attn.q_proj, model.layers.10.self_attn.k_proj,
    model.layers.10.self_attn.v_proj]
# Layer 10: ffn_norm paired with feed_forward.gate and the w1/w3 modules of
# experts 0-31.
- smooth_layer: model.layers.10.ffn_norm
  balance_layers: [model.layers.10.feed_forward.gate, model.layers.10.feed_forward.experts.0.w1,
    model.layers.10.feed_forward.experts.0.w3, model.layers.10.feed_forward.experts.1.w1,
    model.layers.10.feed_forward.experts.1.w3, model.layers.10.feed_forward.experts.2.w1,
    model.layers.10.feed_forward.experts.2.w3, model.layers.10.feed_forward.experts.3.w1,
    model.layers.10.feed_forward.experts.3.w3, model.layers.10.feed_forward.experts.4.w1,
    model.layers.10.feed_forward.experts.4.w3, model.layers.10.feed_forward.experts.5.w1,
    model.layers.10.feed_forward.experts.5.w3, model.layers.10.feed_forward.experts.6.w1,
    model.layers.10.feed_forward.experts.6.w3, model.layers.10.feed_forward.experts.7.w1,
    model.layers.10.feed_forward.experts.7.w3, model.layers.10.feed_forward.experts.8.w1,
    model.layers.10.feed_forward.experts.8.w3, model.layers.10.feed_forward.experts.9.w1,
    model.layers.10.feed_forward.experts.9.w3, model.layers.10.feed_forward.experts.10.w1,
    model.layers.10.feed_forward.experts.10.w3, model.layers.10.feed_forward.experts.11.w1,
    model.layers.10.feed_forward.experts.11.w3, model.layers.10.feed_forward.experts.12.w1,
    model.layers.10.feed_forward.experts.12.w3, model.layers.10.feed_forward.experts.13.w1,
    model.layers.10.feed_forward.experts.13.w3, model.layers.10.feed_forward.experts.14.w1,
    model.layers.10.feed_forward.experts.14.w3, model.layers.10.feed_forward.experts.15.w1,
    model.layers.10.feed_forward.experts.15.w3, model.layers.10.feed_forward.experts.16.w1,
    model.layers.10.feed_forward.experts.16.w3, model.layers.10.feed_forward.experts.17.w1,
    model.layers.10.feed_forward.experts.17.w3, model.layers.10.feed_forward.experts.18.w1,
    model.layers.10.feed_forward.experts.18.w3, model.layers.10.feed_forward.experts.19.w1,
    model.layers.10.feed_forward.experts.19.w3, model.layers.10.feed_forward.experts.20.w1,
    model.layers.10.feed_forward.experts.20.w3, model.layers.10.feed_forward.experts.21.w1,
    model.layers.10.feed_forward.experts.21.w3, model.layers.10.feed_forward.experts.22.w1,
    model.layers.10.feed_forward.experts.22.w3, model.layers.10.feed_forward.experts.23.w1,
    model.layers.10.feed_forward.experts.23.w3, model.layers.10.feed_forward.experts.24.w1,
    model.layers.10.feed_forward.experts.24.w3, model.layers.10.feed_forward.experts.25.w1,
    model.layers.10.feed_forward.experts.25.w3, model.layers.10.feed_forward.experts.26.w1,
    model.layers.10.feed_forward.experts.26.w3, model.layers.10.feed_forward.experts.27.w1,
    model.layers.10.feed_forward.experts.27.w3, model.layers.10.feed_forward.experts.28.w1,
    model.layers.10.feed_forward.experts.28.w3, model.layers.10.feed_forward.experts.29.w1,
    model.layers.10.feed_forward.experts.29.w3, model.layers.10.feed_forward.experts.30.w1,
    model.layers.10.feed_forward.experts.30.w3, model.layers.10.feed_forward.experts.31.w1,
    model.layers.10.feed_forward.experts.31.w3]
# Layer 11: operator_norm paired with conv.in_proj.
- smooth_layer: model.layers.11.operator_norm
  balance_layers: [model.layers.11.conv.in_proj]
# Layer 11: ffn_norm paired with feed_forward.gate and the w1/w3 modules of
# experts 0-31.
- smooth_layer: model.layers.11.ffn_norm
  balance_layers: [model.layers.11.feed_forward.gate, model.layers.11.feed_forward.experts.0.w1,
    model.layers.11.feed_forward.experts.0.w3, model.layers.11.feed_forward.experts.1.w1,
    model.layers.11.feed_forward.experts.1.w3, model.layers.11.feed_forward.experts.2.w1,
    model.layers.11.feed_forward.experts.2.w3, model.layers.11.feed_forward.experts.3.w1,
    model.layers.11.feed_forward.experts.3.w3, model.layers.11.feed_forward.experts.4.w1,
    model.layers.11.feed_forward.experts.4.w3, model.layers.11.feed_forward.experts.5.w1,
    model.layers.11.feed_forward.experts.5.w3, model.layers.11.feed_forward.experts.6.w1,
    model.layers.11.feed_forward.experts.6.w3, model.layers.11.feed_forward.experts.7.w1,
    model.layers.11.feed_forward.experts.7.w3, model.layers.11.feed_forward.experts.8.w1,
    model.layers.11.feed_forward.experts.8.w3, model.layers.11.feed_forward.experts.9.w1,
    model.layers.11.feed_forward.experts.9.w3, model.layers.11.feed_forward.experts.10.w1,
    model.layers.11.feed_forward.experts.10.w3, model.layers.11.feed_forward.experts.11.w1,
    model.layers.11.feed_forward.experts.11.w3, model.layers.11.feed_forward.experts.12.w1,
    model.layers.11.feed_forward.experts.12.w3, model.layers.11.feed_forward.experts.13.w1,
    model.layers.11.feed_forward.experts.13.w3, model.layers.11.feed_forward.experts.14.w1,
    model.layers.11.feed_forward.experts.14.w3, model.layers.11.feed_forward.experts.15.w1,
    model.layers.11.feed_forward.experts.15.w3, model.layers.11.feed_forward.experts.16.w1,
    model.layers.11.feed_forward.experts.16.w3, model.layers.11.feed_forward.experts.17.w1,
    model.layers.11.feed_forward.experts.17.w3, model.layers.11.feed_forward.experts.18.w1,
    model.layers.11.feed_forward.experts.18.w3, model.layers.11.feed_forward.experts.19.w1,
    model.layers.11.feed_forward.experts.19.w3, model.layers.11.feed_forward.experts.20.w1,
    model.layers.11.feed_forward.experts.20.w3, model.layers.11.feed_forward.experts.21.w1,
    model.layers.11.feed_forward.experts.21.w3, model.layers.11.feed_forward.experts.22.w1,
    model.layers.11.feed_forward.experts.22.w3, model.layers.11.feed_forward.experts.23.w1,
    model.layers.11.feed_forward.experts.23.w3, model.layers.11.feed_forward.experts.24.w1,
    model.layers.11.feed_forward.experts.24.w3, model.layers.11.feed_forward.experts.25.w1,
    model.layers.11.feed_forward.experts.25.w3, model.layers.11.feed_forward.experts.26.w1,
    model.layers.11.feed_forward.experts.26.w3, model.layers.11.feed_forward.experts.27.w1,
    model.layers.11.feed_forward.experts.27.w3, model.layers.11.feed_forward.experts.28.w1,
    model.layers.11.feed_forward.experts.28.w3, model.layers.11.feed_forward.experts.29.w1,
    model.layers.11.feed_forward.experts.29.w3, model.layers.11.feed_forward.experts.30.w1,
    model.layers.11.feed_forward.experts.30.w3, model.layers.11.feed_forward.experts.31.w1,
    model.layers.11.feed_forward.experts.31.w3]
# Layer 12: operator_norm paired with conv.in_proj.
- smooth_layer: model.layers.12.operator_norm
  balance_layers: [model.layers.12.conv.in_proj]
# Layer 12: ffn_norm paired with feed_forward.gate and the w1/w3 modules of
# experts 0-31.
- smooth_layer: model.layers.12.ffn_norm
  balance_layers: [model.layers.12.feed_forward.gate, model.layers.12.feed_forward.experts.0.w1,
    model.layers.12.feed_forward.experts.0.w3, model.layers.12.feed_forward.experts.1.w1,
    model.layers.12.feed_forward.experts.1.w3, model.layers.12.feed_forward.experts.2.w1,
    model.layers.12.feed_forward.experts.2.w3, model.layers.12.feed_forward.experts.3.w1,
    model.layers.12.feed_forward.experts.3.w3, model.layers.12.feed_forward.experts.4.w1,
    model.layers.12.feed_forward.experts.4.w3, model.layers.12.feed_forward.experts.5.w1,
    model.layers.12.feed_forward.experts.5.w3, model.layers.12.feed_forward.experts.6.w1,
    model.layers.12.feed_forward.experts.6.w3, model.layers.12.feed_forward.experts.7.w1,
    model.layers.12.feed_forward.experts.7.w3, model.layers.12.feed_forward.experts.8.w1,
    model.layers.12.feed_forward.experts.8.w3, model.layers.12.feed_forward.experts.9.w1,
    model.layers.12.feed_forward.experts.9.w3, model.layers.12.feed_forward.experts.10.w1,
    model.layers.12.feed_forward.experts.10.w3, model.layers.12.feed_forward.experts.11.w1,
    model.layers.12.feed_forward.experts.11.w3, model.layers.12.feed_forward.experts.12.w1,
    model.layers.12.feed_forward.experts.12.w3, model.layers.12.feed_forward.experts.13.w1,
    model.layers.12.feed_forward.experts.13.w3, model.layers.12.feed_forward.experts.14.w1,
    model.layers.12.feed_forward.experts.14.w3, model.layers.12.feed_forward.experts.15.w1,
    model.layers.12.feed_forward.experts.15.w3, model.layers.12.feed_forward.experts.16.w1,
    model.layers.12.feed_forward.experts.16.w3, model.layers.12.feed_forward.experts.17.w1,
    model.layers.12.feed_forward.experts.17.w3, model.layers.12.feed_forward.experts.18.w1,
    model.layers.12.feed_forward.experts.18.w3, model.layers.12.feed_forward.experts.19.w1,
    model.layers.12.feed_forward.experts.19.w3, model.layers.12.feed_forward.experts.20.w1,
    model.layers.12.feed_forward.experts.20.w3, model.layers.12.feed_forward.experts.21.w1,
    model.layers.12.feed_forward.experts.21.w3, model.layers.12.feed_forward.experts.22.w1,
    model.layers.12.feed_forward.experts.22.w3, model.layers.12.feed_forward.experts.23.w1,
    model.layers.12.feed_forward.experts.23.w3, model.layers.12.feed_forward.experts.24.w1,
    model.layers.12.feed_forward.experts.24.w3, model.layers.12.feed_forward.experts.25.w1,
    model.layers.12.feed_forward.experts.25.w3, model.layers.12.feed_forward.experts.26.w1,
    model.layers.12.feed_forward.experts.26.w3, model.layers.12.feed_forward.experts.27.w1,
    model.layers.12.feed_forward.experts.27.w3, model.layers.12.feed_forward.experts.28.w1,
    model.layers.12.feed_forward.experts.28.w3, model.layers.12.feed_forward.experts.29.w1,
    model.layers.12.feed_forward.experts.29.w3, model.layers.12.feed_forward.experts.30.w1,
    model.layers.12.feed_forward.experts.30.w3, model.layers.12.feed_forward.experts.31.w1,
    model.layers.12.feed_forward.experts.31.w3]
- smooth_layer: model.layers.13.operator_norm
balance_layers: [model.layers.13.conv.in_proj]
- smooth_layer: model.layers.13.ffn_norm
balance_layers: [model.layers.13.feed_forward.gate, model.layers.13.feed_forward.experts.0.w1,
model.layers.13.feed_forward.experts.0.w3, model.layers.13.feed_forward.experts.1.w1,
model.layers.13.feed_forward.experts.1.w3, model.layers.13.feed_forward.experts.2.w1,
model.layers.13.feed_forward.experts.2.w3, model.layers.13.feed_forward.experts.3.w1,
model.layers.13.feed_forward.experts.3.w3, model.layers.13.feed_forward.experts.4.w1,
model.layers.13.feed_forward.experts.4.w3, model.layers.13.feed_forward.experts.5.w1,
model.layers.13.feed_forward.experts.5.w3, model.layers.13.feed_forward.experts.6.w1,
model.layers.13.feed_forward.experts.6.w3, model.layers.13.feed_forward.experts.7.w1,
model.layers.13.feed_forward.experts.7.w3, model.layers.13.feed_forward.experts.8.w1,
model.layers.13.feed_forward.experts.8.w3, model.layers.13.feed_forward.experts.9.w1,
model.layers.13.feed_forward.experts.9.w3, model.layers.13.feed_forward.experts.10.w1,
model.layers.13.feed_forward.experts.10.w3, model.layers.13.feed_forward.experts.11.w1,
model.layers.13.feed_forward.experts.11.w3, model.layers.13.feed_forward.experts.12.w1,
model.layers.13.feed_forward.experts.12.w3, model.layers.13.feed_forward.experts.13.w1,
model.layers.13.feed_forward.experts.13.w3, model.layers.13.feed_forward.experts.14.w1,
model.layers.13.feed_forward.experts.14.w3, model.layers.13.feed_forward.experts.15.w1,
model.layers.13.feed_forward.experts.15.w3, model.layers.13.feed_forward.experts.16.w1,
model.layers.13.feed_forward.experts.16.w3, model.layers.13.feed_forward.experts.17.w1,
model.layers.13.feed_forward.experts.17.w3, model.layers.13.feed_forward.experts.18.w1,
model.layers.13.feed_forward.experts.18.w3, model.layers.13.feed_forward.experts.19.w1,
model.layers.13.feed_forward.experts.19.w3, model.layers.13.feed_forward.experts.20.w1,
model.layers.13.feed_forward.experts.20.w3, model.layers.13.feed_forward.experts.21.w1,
model.layers.13.feed_forward.experts.21.w3, model.layers.13.feed_forward.experts.22.w1,
model.layers.13.feed_forward.experts.22.w3, model.layers.13.feed_forward.experts.23.w1,
model.layers.13.feed_forward.experts.23.w3, model.layers.13.feed_forward.experts.24.w1,
model.layers.13.feed_forward.experts.24.w3, model.layers.13.feed_forward.experts.25.w1,
model.layers.13.feed_forward.experts.25.w3, model.layers.13.feed_forward.experts.26.w1,
model.layers.13.feed_forward.experts.26.w3, model.layers.13.feed_forward.experts.27.w1,
model.layers.13.feed_forward.experts.27.w3, model.layers.13.feed_forward.experts.28.w1,
model.layers.13.feed_forward.experts.28.w3, model.layers.13.feed_forward.experts.29.w1,
model.layers.13.feed_forward.experts.29.w3, model.layers.13.feed_forward.experts.30.w1,
model.layers.13.feed_forward.experts.30.w3, model.layers.13.feed_forward.experts.31.w1,
model.layers.13.feed_forward.experts.31.w3]
- smooth_layer: model.layers.14.operator_norm
balance_layers: [model.layers.14.self_attn.q_proj, model.layers.14.self_attn.k_proj,
model.layers.14.self_attn.v_proj]
- smooth_layer: model.layers.14.ffn_norm
balance_layers: [model.layers.14.feed_forward.gate, model.layers.14.feed_forward.experts.0.w1,
model.layers.14.feed_forward.experts.0.w3, model.layers.14.feed_forward.experts.1.w1,
model.layers.14.feed_forward.experts.1.w3, model.layers.14.feed_forward.experts.2.w1,
model.layers.14.feed_forward.experts.2.w3, model.layers.14.feed_forward.experts.3.w1,
model.layers.14.feed_forward.experts.3.w3, model.layers.14.feed_forward.experts.4.w1,
model.layers.14.feed_forward.experts.4.w3, model.layers.14.feed_forward.experts.5.w1,
model.layers.14.feed_forward.experts.5.w3, model.layers.14.feed_forward.experts.6.w1,
model.layers.14.feed_forward.experts.6.w3, model.layers.14.feed_forward.experts.7.w1,
model.layers.14.feed_forward.experts.7.w3, model.layers.14.feed_forward.experts.8.w1,
model.layers.14.feed_forward.experts.8.w3, model.layers.14.feed_forward.experts.9.w1,
model.layers.14.feed_forward.experts.9.w3, model.layers.14.feed_forward.experts.10.w1,
model.layers.14.feed_forward.experts.10.w3, model.layers.14.feed_forward.experts.11.w1,
model.layers.14.feed_forward.experts.11.w3, model.layers.14.feed_forward.experts.12.w1,
model.layers.14.feed_forward.experts.12.w3, model.layers.14.feed_forward.experts.13.w1,
model.layers.14.feed_forward.experts.13.w3, model.layers.14.feed_forward.experts.14.w1,
model.layers.14.feed_forward.experts.14.w3, model.layers.14.feed_forward.experts.15.w1,
model.layers.14.feed_forward.experts.15.w3, model.layers.14.feed_forward.experts.16.w1,
model.layers.14.feed_forward.experts.16.w3, model.layers.14.feed_forward.experts.17.w1,
model.layers.14.feed_forward.experts.17.w3, model.layers.14.feed_forward.experts.18.w1,
model.layers.14.feed_forward.experts.18.w3, model.layers.14.feed_forward.experts.19.w1,
model.layers.14.feed_forward.experts.19.w3, model.layers.14.feed_forward.experts.20.w1,
model.layers.14.feed_forward.experts.20.w3, model.layers.14.feed_forward.experts.21.w1,
model.layers.14.feed_forward.experts.21.w3, model.layers.14.feed_forward.experts.22.w1,
model.layers.14.feed_forward.experts.22.w3, model.layers.14.feed_forward.experts.23.w1,
model.layers.14.feed_forward.experts.23.w3, model.layers.14.feed_forward.experts.24.w1,
model.layers.14.feed_forward.experts.24.w3, model.layers.14.feed_forward.experts.25.w1,
model.layers.14.feed_forward.experts.25.w3, model.layers.14.feed_forward.experts.26.w1,
model.layers.14.feed_forward.experts.26.w3, model.layers.14.feed_forward.experts.27.w1,
model.layers.14.feed_forward.experts.27.w3, model.layers.14.feed_forward.experts.28.w1,
model.layers.14.feed_forward.experts.28.w3, model.layers.14.feed_forward.experts.29.w1,
model.layers.14.feed_forward.experts.29.w3, model.layers.14.feed_forward.experts.30.w1,
model.layers.14.feed_forward.experts.30.w3, model.layers.14.feed_forward.experts.31.w1,
model.layers.14.feed_forward.experts.31.w3]
- smooth_layer: model.layers.15.operator_norm
balance_layers: [model.layers.15.conv.in_proj]
- smooth_layer: model.layers.15.ffn_norm
balance_layers: [model.layers.15.feed_forward.gate, model.layers.15.feed_forward.experts.0.w1,
model.layers.15.feed_forward.experts.0.w3, model.layers.15.feed_forward.experts.1.w1,
model.layers.15.feed_forward.experts.1.w3, model.layers.15.feed_forward.experts.2.w1,
model.layers.15.feed_forward.experts.2.w3, model.layers.15.feed_forward.experts.3.w1,
model.layers.15.feed_forward.experts.3.w3, model.layers.15.feed_forward.experts.4.w1,
model.layers.15.feed_forward.experts.4.w3, model.layers.15.feed_forward.experts.5.w1,
model.layers.15.feed_forward.experts.5.w3, model.layers.15.feed_forward.experts.6.w1,
model.layers.15.feed_forward.experts.6.w3, model.layers.15.feed_forward.experts.7.w1,
model.layers.15.feed_forward.experts.7.w3, model.layers.15.feed_forward.experts.8.w1,
model.layers.15.feed_forward.experts.8.w3, model.layers.15.feed_forward.experts.9.w1,
model.layers.15.feed_forward.experts.9.w3, model.layers.15.feed_forward.experts.10.w1,
model.layers.15.feed_forward.experts.10.w3, model.layers.15.feed_forward.experts.11.w1,
model.layers.15.feed_forward.experts.11.w3, model.layers.15.feed_forward.experts.12.w1,
model.layers.15.feed_forward.experts.12.w3, model.layers.15.feed_forward.experts.13.w1,
model.layers.15.feed_forward.experts.13.w3, model.layers.15.feed_forward.experts.14.w1,
model.layers.15.feed_forward.experts.14.w3, model.layers.15.feed_forward.experts.15.w1,
model.layers.15.feed_forward.experts.15.w3, model.layers.15.feed_forward.experts.16.w1,
model.layers.15.feed_forward.experts.16.w3, model.layers.15.feed_forward.experts.17.w1,
model.layers.15.feed_forward.experts.17.w3, model.layers.15.feed_forward.experts.18.w1,
model.layers.15.feed_forward.experts.18.w3, model.layers.15.feed_forward.experts.19.w1,
model.layers.15.feed_forward.experts.19.w3, model.layers.15.feed_forward.experts.20.w1,
model.layers.15.feed_forward.experts.20.w3, model.layers.15.feed_forward.experts.21.w1,
model.layers.15.feed_forward.experts.21.w3, model.layers.15.feed_forward.experts.22.w1,
model.layers.15.feed_forward.experts.22.w3, model.layers.15.feed_forward.experts.23.w1,
model.layers.15.feed_forward.experts.23.w3, model.layers.15.feed_forward.experts.24.w1,
model.layers.15.feed_forward.experts.24.w3, model.layers.15.feed_forward.experts.25.w1,
model.layers.15.feed_forward.experts.25.w3, model.layers.15.feed_forward.experts.26.w1,
model.layers.15.feed_forward.experts.26.w3, model.layers.15.feed_forward.experts.27.w1,
model.layers.15.feed_forward.experts.27.w3, model.layers.15.feed_forward.experts.28.w1,
model.layers.15.feed_forward.experts.28.w3, model.layers.15.feed_forward.experts.29.w1,
model.layers.15.feed_forward.experts.29.w3, model.layers.15.feed_forward.experts.30.w1,
model.layers.15.feed_forward.experts.30.w3, model.layers.15.feed_forward.experts.31.w1,
model.layers.15.feed_forward.experts.31.w3]
- smooth_layer: model.layers.16.operator_norm
balance_layers: [model.layers.16.conv.in_proj]
- smooth_layer: model.layers.16.ffn_norm
balance_layers: [model.layers.16.feed_forward.gate, model.layers.16.feed_forward.experts.0.w1,
model.layers.16.feed_forward.experts.0.w3, model.layers.16.feed_forward.experts.1.w1,
model.layers.16.feed_forward.experts.1.w3, model.layers.16.feed_forward.experts.2.w1,
model.layers.16.feed_forward.experts.2.w3, model.layers.16.feed_forward.experts.3.w1,
model.layers.16.feed_forward.experts.3.w3, model.layers.16.feed_forward.experts.4.w1,
model.layers.16.feed_forward.experts.4.w3, model.layers.16.feed_forward.experts.5.w1,
model.layers.16.feed_forward.experts.5.w3, model.layers.16.feed_forward.experts.6.w1,
model.layers.16.feed_forward.experts.6.w3, model.layers.16.feed_forward.experts.7.w1,
model.layers.16.feed_forward.experts.7.w3, model.layers.16.feed_forward.experts.8.w1,
model.layers.16.feed_forward.experts.8.w3, model.layers.16.feed_forward.experts.9.w1,
model.layers.16.feed_forward.experts.9.w3, model.layers.16.feed_forward.experts.10.w1,
model.layers.16.feed_forward.experts.10.w3, model.layers.16.feed_forward.experts.11.w1,
model.layers.16.feed_forward.experts.11.w3, model.layers.16.feed_forward.experts.12.w1,
model.layers.16.feed_forward.experts.12.w3, model.layers.16.feed_forward.experts.13.w1,
model.layers.16.feed_forward.experts.13.w3, model.layers.16.feed_forward.experts.14.w1,
model.layers.16.feed_forward.experts.14.w3, model.layers.16.feed_forward.experts.15.w1,
model.layers.16.feed_forward.experts.15.w3, model.layers.16.feed_forward.experts.16.w1,
model.layers.16.feed_forward.experts.16.w3, model.layers.16.feed_forward.experts.17.w1,
model.layers.16.feed_forward.experts.17.w3, model.layers.16.feed_forward.experts.18.w1,
model.layers.16.feed_forward.experts.18.w3, model.layers.16.feed_forward.experts.19.w1,
model.layers.16.feed_forward.experts.19.w3, model.layers.16.feed_forward.experts.20.w1,
model.layers.16.feed_forward.experts.20.w3, model.layers.16.feed_forward.experts.21.w1,
model.layers.16.feed_forward.experts.21.w3, model.layers.16.feed_forward.experts.22.w1,
model.layers.16.feed_forward.experts.22.w3, model.layers.16.feed_forward.experts.23.w1,
model.layers.16.feed_forward.experts.23.w3, model.layers.16.feed_forward.experts.24.w1,
model.layers.16.feed_forward.experts.24.w3, model.layers.16.feed_forward.experts.25.w1,
model.layers.16.feed_forward.experts.25.w3, model.layers.16.feed_forward.experts.26.w1,
model.layers.16.feed_forward.experts.26.w3, model.layers.16.feed_forward.experts.27.w1,
model.layers.16.feed_forward.experts.27.w3, model.layers.16.feed_forward.experts.28.w1,
model.layers.16.feed_forward.experts.28.w3, model.layers.16.feed_forward.experts.29.w1,
model.layers.16.feed_forward.experts.29.w3, model.layers.16.feed_forward.experts.30.w1,
model.layers.16.feed_forward.experts.30.w3, model.layers.16.feed_forward.experts.31.w1,
model.layers.16.feed_forward.experts.31.w3]
- smooth_layer: model.layers.17.operator_norm
balance_layers: [model.layers.17.conv.in_proj]
- smooth_layer: model.layers.17.ffn_norm
balance_layers: [model.layers.17.feed_forward.gate, model.layers.17.feed_forward.experts.0.w1,
model.layers.17.feed_forward.experts.0.w3, model.layers.17.feed_forward.experts.1.w1,
model.layers.17.feed_forward.experts.1.w3, model.layers.17.feed_forward.experts.2.w1,
model.layers.17.feed_forward.experts.2.w3, model.layers.17.feed_forward.experts.3.w1,
model.layers.17.feed_forward.experts.3.w3, model.layers.17.feed_forward.experts.4.w1,
model.layers.17.feed_forward.experts.4.w3, model.layers.17.feed_forward.experts.5.w1,
model.layers.17.feed_forward.experts.5.w3, model.layers.17.feed_forward.experts.6.w1,
model.layers.17.feed_forward.experts.6.w3, model.layers.17.feed_forward.experts.7.w1,
model.layers.17.feed_forward.experts.7.w3, model.layers.17.feed_forward.experts.8.w1,
model.layers.17.feed_forward.experts.8.w3, model.layers.17.feed_forward.experts.9.w1,
model.layers.17.feed_forward.experts.9.w3, model.layers.17.feed_forward.experts.10.w1,
model.layers.17.feed_forward.experts.10.w3, model.layers.17.feed_forward.experts.11.w1,
model.layers.17.feed_forward.experts.11.w3, model.layers.17.feed_forward.experts.12.w1,
model.layers.17.feed_forward.experts.12.w3, model.layers.17.feed_forward.experts.13.w1,
model.layers.17.feed_forward.experts.13.w3, model.layers.17.feed_forward.experts.14.w1,
model.layers.17.feed_forward.experts.14.w3, model.layers.17.feed_forward.experts.15.w1,
model.layers.17.feed_forward.experts.15.w3, model.layers.17.feed_forward.experts.16.w1,
model.layers.17.feed_forward.experts.16.w3, model.layers.17.feed_forward.experts.17.w1,
model.layers.17.feed_forward.experts.17.w3, model.layers.17.feed_forward.experts.18.w1,
model.layers.17.feed_forward.experts.18.w3, model.layers.17.feed_forward.experts.19.w1,
model.layers.17.feed_forward.experts.19.w3, model.layers.17.feed_forward.experts.20.w1,
model.layers.17.feed_forward.experts.20.w3, model.layers.17.feed_forward.experts.21.w1,
model.layers.17.feed_forward.experts.21.w3, model.layers.17.feed_forward.experts.22.w1,
model.layers.17.feed_forward.experts.22.w3, model.layers.17.feed_forward.experts.23.w1,
model.layers.17.feed_forward.experts.23.w3, model.layers.17.feed_forward.experts.24.w1,
model.layers.17.feed_forward.experts.24.w3, model.layers.17.feed_forward.experts.25.w1,
model.layers.17.feed_forward.experts.25.w3, model.layers.17.feed_forward.experts.26.w1,
model.layers.17.feed_forward.experts.26.w3, model.layers.17.feed_forward.experts.27.w1,
model.layers.17.feed_forward.experts.27.w3, model.layers.17.feed_forward.experts.28.w1,
model.layers.17.feed_forward.experts.28.w3, model.layers.17.feed_forward.experts.29.w1,
model.layers.17.feed_forward.experts.29.w3, model.layers.17.feed_forward.experts.30.w1,
model.layers.17.feed_forward.experts.30.w3, model.layers.17.feed_forward.experts.31.w1,
model.layers.17.feed_forward.experts.31.w3]
- smooth_layer: model.layers.18.operator_norm
balance_layers: [model.layers.18.self_attn.q_proj, model.layers.18.self_attn.k_proj,
model.layers.18.self_attn.v_proj]
- smooth_layer: model.layers.18.ffn_norm
balance_layers: [model.layers.18.feed_forward.gate, model.layers.18.feed_forward.experts.0.w1,
model.layers.18.feed_forward.experts.0.w3, model.layers.18.feed_forward.experts.1.w1,
model.layers.18.feed_forward.experts.1.w3, model.layers.18.feed_forward.experts.2.w1,
model.layers.18.feed_forward.experts.2.w3, model.layers.18.feed_forward.experts.3.w1,
model.layers.18.feed_forward.experts.3.w3, model.layers.18.feed_forward.experts.4.w1,
model.layers.18.feed_forward.experts.4.w3, model.layers.18.feed_forward.experts.5.w1,
model.layers.18.feed_forward.experts.5.w3, model.layers.18.feed_forward.experts.6.w1,
model.layers.18.feed_forward.experts.6.w3, model.layers.18.feed_forward.experts.7.w1,
model.layers.18.feed_forward.experts.7.w3, model.layers.18.feed_forward.experts.8.w1,
model.layers.18.feed_forward.experts.8.w3, model.layers.18.feed_forward.experts.9.w1,
model.layers.18.feed_forward.experts.9.w3, model.layers.18.feed_forward.experts.10.w1,
model.layers.18.feed_forward.experts.10.w3, model.layers.18.feed_forward.experts.11.w1,
model.layers.18.feed_forward.experts.11.w3, model.layers.18.feed_forward.experts.12.w1,
model.layers.18.feed_forward.experts.12.w3, model.layers.18.feed_forward.experts.13.w1,
model.layers.18.feed_forward.experts.13.w3, model.layers.18.feed_forward.experts.14.w1,
model.layers.18.feed_forward.experts.14.w3, model.layers.18.feed_forward.experts.15.w1,
model.layers.18.feed_forward.experts.15.w3, model.layers.18.feed_forward.experts.16.w1,
model.layers.18.feed_forward.experts.16.w3, model.layers.18.feed_forward.experts.17.w1,
model.layers.18.feed_forward.experts.17.w3, model.layers.18.feed_forward.experts.18.w1,
model.layers.18.feed_forward.experts.18.w3, model.layers.18.feed_forward.experts.19.w1,
model.layers.18.feed_forward.experts.19.w3, model.layers.18.feed_forward.experts.20.w1,
model.layers.18.feed_forward.experts.20.w3, model.layers.18.feed_forward.experts.21.w1,
model.layers.18.feed_forward.experts.21.w3, model.layers.18.feed_forward.experts.22.w1,
model.layers.18.feed_forward.experts.22.w3, model.layers.18.feed_forward.experts.23.w1,
model.layers.18.feed_forward.experts.23.w3, model.layers.18.feed_forward.experts.24.w1,
model.layers.18.feed_forward.experts.24.w3, model.layers.18.feed_forward.experts.25.w1,
model.layers.18.feed_forward.experts.25.w3, model.layers.18.feed_forward.experts.26.w1,
model.layers.18.feed_forward.experts.26.w3, model.layers.18.feed_forward.experts.27.w1,
model.layers.18.feed_forward.experts.27.w3, model.layers.18.feed_forward.experts.28.w1,
model.layers.18.feed_forward.experts.28.w3, model.layers.18.feed_forward.experts.29.w1,
model.layers.18.feed_forward.experts.29.w3, model.layers.18.feed_forward.experts.30.w1,
model.layers.18.feed_forward.experts.30.w3, model.layers.18.feed_forward.experts.31.w1,
model.layers.18.feed_forward.experts.31.w3]
- smooth_layer: model.layers.19.operator_norm
balance_layers: [model.layers.19.conv.in_proj]
- smooth_layer: model.layers.19.ffn_norm
balance_layers: [model.layers.19.feed_forward.gate, model.layers.19.feed_forward.experts.0.w1,
model.layers.19.feed_forward.experts.0.w3, model.layers.19.feed_forward.experts.1.w1,
model.layers.19.feed_forward.experts.1.w3, model.layers.19.feed_forward.experts.2.w1,
model.layers.19.feed_forward.experts.2.w3, model.layers.19.feed_forward.experts.3.w1,
model.layers.19.feed_forward.experts.3.w3, model.layers.19.feed_forward.experts.4.w1,
model.layers.19.feed_forward.experts.4.w3, model.layers.19.feed_forward.experts.5.w1,
model.layers.19.feed_forward.experts.5.w3, model.layers.19.feed_forward.experts.6.w1,
model.layers.19.feed_forward.experts.6.w3, model.layers.19.feed_forward.experts.7.w1,
model.layers.19.feed_forward.experts.7.w3, model.layers.19.feed_forward.experts.8.w1,
model.layers.19.feed_forward.experts.8.w3, model.layers.19.feed_forward.experts.9.w1,
model.layers.19.feed_forward.experts.9.w3, model.layers.19.feed_forward.experts.10.w1,
model.layers.19.feed_forward.experts.10.w3, model.layers.19.feed_forward.experts.11.w1,
model.layers.19.feed_forward.experts.11.w3, model.layers.19.feed_forward.experts.12.w1,
model.layers.19.feed_forward.experts.12.w3, model.layers.19.feed_forward.experts.13.w1,
model.layers.19.feed_forward.experts.13.w3, model.layers.19.feed_forward.experts.14.w1,
model.layers.19.feed_forward.experts.14.w3, model.layers.19.feed_forward.experts.15.w1,
model.layers.19.feed_forward.experts.15.w3, model.layers.19.feed_forward.experts.16.w1,
model.layers.19.feed_forward.experts.16.w3, model.layers.19.feed_forward.experts.17.w1,
model.layers.19.feed_forward.experts.17.w3, model.layers.19.feed_forward.experts.18.w1,
model.layers.19.feed_forward.experts.18.w3, model.layers.19.feed_forward.experts.19.w1,
model.layers.19.feed_forward.experts.19.w3, model.layers.19.feed_forward.experts.20.w1,
model.layers.19.feed_forward.experts.20.w3, model.layers.19.feed_forward.experts.21.w1,
model.layers.19.feed_forward.experts.21.w3, model.layers.19.feed_forward.experts.22.w1,
model.layers.19.feed_forward.experts.22.w3, model.layers.19.feed_forward.experts.23.w1,
model.layers.19.feed_forward.experts.23.w3, model.layers.19.feed_forward.experts.24.w1,
model.layers.19.feed_forward.experts.24.w3, model.layers.19.feed_forward.experts.25.w1,
model.layers.19.feed_forward.experts.25.w3, model.layers.19.feed_forward.experts.26.w1,
model.layers.19.feed_forward.experts.26.w3, model.layers.19.feed_forward.experts.27.w1,
model.layers.19.feed_forward.experts.27.w3, model.layers.19.feed_forward.experts.28.w1,
model.layers.19.feed_forward.experts.28.w3, model.layers.19.feed_forward.experts.29.w1,
model.layers.19.feed_forward.experts.29.w3, model.layers.19.feed_forward.experts.30.w1,
model.layers.19.feed_forward.experts.30.w3, model.layers.19.feed_forward.experts.31.w1,
model.layers.19.feed_forward.experts.31.w3]
- smooth_layer: model.layers.20.operator_norm
balance_layers: [model.layers.20.conv.in_proj]
- smooth_layer: model.layers.20.ffn_norm
balance_layers: [model.layers.20.feed_forward.gate, model.layers.20.feed_forward.experts.0.w1,
model.layers.20.feed_forward.experts.0.w3, model.layers.20.feed_forward.experts.1.w1,
model.layers.20.feed_forward.experts.1.w3, model.layers.20.feed_forward.experts.2.w1,
model.layers.20.feed_forward.experts.2.w3, model.layers.20.feed_forward.experts.3.w1,
model.layers.20.feed_forward.experts.3.w3, model.layers.20.feed_forward.experts.4.w1,
model.layers.20.feed_forward.experts.4.w3, model.layers.20.feed_forward.experts.5.w1,
model.layers.20.feed_forward.experts.5.w3, model.layers.20.feed_forward.experts.6.w1,
model.layers.20.feed_forward.experts.6.w3, model.layers.20.feed_forward.experts.7.w1,
model.layers.20.feed_forward.experts.7.w3, model.layers.20.feed_forward.experts.8.w1,
model.layers.20.feed_forward.experts.8.w3, model.layers.20.feed_forward.experts.9.w1,
model.layers.20.feed_forward.experts.9.w3, model.layers.20.feed_forward.experts.10.w1,
model.layers.20.feed_forward.experts.10.w3, model.layers.20.feed_forward.experts.11.w1,
model.layers.20.feed_forward.experts.11.w3, model.layers.20.feed_forward.experts.12.w1,
model.layers.20.feed_forward.experts.12.w3, model.layers.20.feed_forward.experts.13.w1,
model.layers.20.feed_forward.experts.13.w3, model.layers.20.feed_forward.experts.14.w1,
model.layers.20.feed_forward.experts.14.w3, model.layers.20.feed_forward.experts.15.w1,
model.layers.20.feed_forward.experts.15.w3, model.layers.20.feed_forward.experts.16.w1,
model.layers.20.feed_forward.experts.16.w3, model.layers.20.feed_forward.experts.17.w1,
model.layers.20.feed_forward.experts.17.w3, model.layers.20.feed_forward.experts.18.w1,
model.layers.20.feed_forward.experts.18.w3, model.layers.20.feed_forward.experts.19.w1,
model.layers.20.feed_forward.experts.19.w3, model.layers.20.feed_forward.experts.20.w1,
model.layers.20.feed_forward.experts.20.w3, model.layers.20.feed_forward.experts.21.w1,
model.layers.20.feed_forward.experts.21.w3, model.layers.20.feed_forward.experts.22.w1,
model.layers.20.feed_forward.experts.22.w3, model.layers.20.feed_forward.experts.23.w1,
model.layers.20.feed_forward.experts.23.w3, model.layers.20.feed_forward.experts.24.w1,
model.layers.20.feed_forward.experts.24.w3, model.layers.20.feed_forward.experts.25.w1,
model.layers.20.feed_forward.experts.25.w3, model.layers.20.feed_forward.experts.26.w1,
model.layers.20.feed_forward.experts.26.w3, model.layers.20.feed_forward.experts.27.w1,
model.layers.20.feed_forward.experts.27.w3, model.layers.20.feed_forward.experts.28.w1,
model.layers.20.feed_forward.experts.28.w3, model.layers.20.feed_forward.experts.29.w1,
model.layers.20.feed_forward.experts.29.w3, model.layers.20.feed_forward.experts.30.w1,
model.layers.20.feed_forward.experts.30.w3, model.layers.20.feed_forward.experts.31.w1,
model.layers.20.feed_forward.experts.31.w3]
- smooth_layer: model.layers.21.operator_norm
balance_layers: [model.layers.21.self_attn.q_proj, model.layers.21.self_attn.k_proj,
model.layers.21.self_attn.v_proj]
- smooth_layer: model.layers.21.ffn_norm
balance_layers: [model.layers.21.feed_forward.gate, model.layers.21.feed_forward.experts.0.w1,
model.layers.21.feed_forward.experts.0.w3, model.layers.21.feed_forward.experts.1.w1,
model.layers.21.feed_forward.experts.1.w3, model.layers.21.feed_forward.experts.2.w1,
model.layers.21.feed_forward.experts.2.w3, model.layers.21.feed_forward.experts.3.w1,
model.layers.21.feed_forward.experts.3.w3, model.layers.21.feed_forward.experts.4.w1,
model.layers.21.feed_forward.experts.4.w3, model.layers.21.feed_forward.experts.5.w1,
model.layers.21.feed_forward.experts.5.w3, model.layers.21.feed_forward.experts.6.w1,
model.layers.21.feed_forward.experts.6.w3, model.layers.21.feed_forward.experts.7.w1,
model.layers.21.feed_forward.experts.7.w3, model.layers.21.feed_forward.experts.8.w1,
model.layers.21.feed_forward.experts.8.w3, model.layers.21.feed_forward.experts.9.w1,
model.layers.21.feed_forward.experts.9.w3, model.layers.21.feed_forward.experts.10.w1,
model.layers.21.feed_forward.experts.10.w3, model.layers.21.feed_forward.experts.11.w1,
model.layers.21.feed_forward.experts.11.w3, model.layers.21.feed_forward.experts.12.w1,
model.layers.21.feed_forward.experts.12.w3, model.layers.21.feed_forward.experts.13.w1,
model.layers.21.feed_forward.experts.13.w3, model.layers.21.feed_forward.experts.14.w1,
model.layers.21.feed_forward.experts.14.w3, model.layers.21.feed_forward.experts.15.w1,
model.layers.21.feed_forward.experts.15.w3, model.layers.21.feed_forward.experts.16.w1,
model.layers.21.feed_forward.experts.16.w3, model.layers.21.feed_forward.experts.17.w1,
model.layers.21.feed_forward.experts.17.w3, model.layers.21.feed_forward.experts.18.w1,
model.layers.21.feed_forward.experts.18.w3, model.layers.21.feed_forward.experts.19.w1,
model.layers.21.feed_forward.experts.19.w3, model.layers.21.feed_forward.experts.20.w1,
model.layers.21.feed_forward.experts.20.w3, model.layers.21.feed_forward.experts.21.w1,
model.layers.21.feed_forward.experts.21.w3, model.layers.21.feed_forward.experts.22.w1,
model.layers.21.feed_forward.experts.22.w3, model.layers.21.feed_forward.experts.23.w1,
model.layers.21.feed_forward.experts.23.w3, model.layers.21.feed_forward.experts.24.w1,
model.layers.21.feed_forward.experts.24.w3, model.layers.21.feed_forward.experts.25.w1,
model.layers.21.feed_forward.experts.25.w3, model.layers.21.feed_forward.experts.26.w1,
model.layers.21.feed_forward.experts.26.w3, model.layers.21.feed_forward.experts.27.w1,
model.layers.21.feed_forward.experts.27.w3, model.layers.21.feed_forward.experts.28.w1,
model.layers.21.feed_forward.experts.28.w3, model.layers.21.feed_forward.experts.29.w1,
model.layers.21.feed_forward.experts.29.w3, model.layers.21.feed_forward.experts.30.w1,
model.layers.21.feed_forward.experts.30.w3, model.layers.21.feed_forward.experts.31.w1,
model.layers.21.feed_forward.experts.31.w3]
- smooth_layer: model.layers.22.operator_norm
balance_layers: [model.layers.22.conv.in_proj]
- smooth_layer: model.layers.22.ffn_norm
balance_layers: [model.layers.22.feed_forward.gate, model.layers.22.feed_forward.experts.0.w1,
model.layers.22.feed_forward.experts.0.w3, model.layers.22.feed_forward.experts.1.w1,
model.layers.22.feed_forward.experts.1.w3, model.layers.22.feed_forward.experts.2.w1,
model.layers.22.feed_forward.experts.2.w3, model.layers.22.feed_forward.experts.3.w1,
model.layers.22.feed_forward.experts.3.w3, model.layers.22.feed_forward.experts.4.w1,
model.layers.22.feed_forward.experts.4.w3, model.layers.22.feed_forward.experts.5.w1,
model.layers.22.feed_forward.experts.5.w3, model.layers.22.feed_forward.experts.6.w1,
model.layers.22.feed_forward.experts.6.w3, model.layers.22.feed_forward.experts.7.w1,
model.layers.22.feed_forward.experts.7.w3, model.layers.22.feed_forward.experts.8.w1,
model.layers.22.feed_forward.experts.8.w3, model.layers.22.feed_forward.experts.9.w1,
model.layers.22.feed_forward.experts.9.w3, model.layers.22.feed_forward.experts.10.w1,
model.layers.22.feed_forward.experts.10.w3, model.layers.22.feed_forward.experts.11.w1,
model.layers.22.feed_forward.experts.11.w3, model.layers.22.feed_forward.experts.12.w1,
model.layers.22.feed_forward.experts.12.w3, model.layers.22.feed_forward.experts.13.w1,
model.layers.22.feed_forward.experts.13.w3, model.layers.22.feed_forward.experts.14.w1,
model.layers.22.feed_forward.experts.14.w3, model.layers.22.feed_forward.experts.15.w1,
model.layers.22.feed_forward.experts.15.w3, model.layers.22.feed_forward.experts.16.w1,
model.layers.22.feed_forward.experts.16.w3, model.layers.22.feed_forward.experts.17.w1,
model.layers.22.feed_forward.experts.17.w3, model.layers.22.feed_forward.experts.18.w1,
model.layers.22.feed_forward.experts.18.w3, model.layers.22.feed_forward.experts.19.w1,
model.layers.22.feed_forward.experts.19.w3, model.layers.22.feed_forward.experts.20.w1,
model.layers.22.feed_forward.experts.20.w3, model.layers.22.feed_forward.experts.21.w1,
model.layers.22.feed_forward.experts.21.w3, model.layers.22.feed_forward.experts.22.w1,
model.layers.22.feed_forward.experts.22.w3, model.layers.22.feed_forward.experts.23.w1,
model.layers.22.feed_forward.experts.23.w3, model.layers.22.feed_forward.experts.24.w1,
model.layers.22.feed_forward.experts.24.w3, model.layers.22.feed_forward.experts.25.w1,
model.layers.22.feed_forward.experts.25.w3, model.layers.22.feed_forward.experts.26.w1,
model.layers.22.feed_forward.experts.26.w3, model.layers.22.feed_forward.experts.27.w1,
model.layers.22.feed_forward.experts.27.w3, model.layers.22.feed_forward.experts.28.w1,
model.layers.22.feed_forward.experts.28.w3, model.layers.22.feed_forward.experts.29.w1,
model.layers.22.feed_forward.experts.29.w3, model.layers.22.feed_forward.experts.30.w1,
model.layers.22.feed_forward.experts.30.w3, model.layers.22.feed_forward.experts.31.w1,
model.layers.22.feed_forward.experts.31.w3]
- smooth_layer: model.layers.23.operator_norm
balance_layers: [model.layers.23.conv.in_proj]
# Layer 23, ffn_norm mapping: feed_forward.gate followed by the w1/w3
# entries of experts 0-31, in ascending expert order (65 balance layers).
- smooth_layer: model.layers.23.ffn_norm
  balance_layers:
    - model.layers.23.feed_forward.gate
    - model.layers.23.feed_forward.experts.0.w1
    - model.layers.23.feed_forward.experts.0.w3
    - model.layers.23.feed_forward.experts.1.w1
    - model.layers.23.feed_forward.experts.1.w3
    - model.layers.23.feed_forward.experts.2.w1
    - model.layers.23.feed_forward.experts.2.w3
    - model.layers.23.feed_forward.experts.3.w1
    - model.layers.23.feed_forward.experts.3.w3
    - model.layers.23.feed_forward.experts.4.w1
    - model.layers.23.feed_forward.experts.4.w3
    - model.layers.23.feed_forward.experts.5.w1
    - model.layers.23.feed_forward.experts.5.w3
    - model.layers.23.feed_forward.experts.6.w1
    - model.layers.23.feed_forward.experts.6.w3
    - model.layers.23.feed_forward.experts.7.w1
    - model.layers.23.feed_forward.experts.7.w3
    - model.layers.23.feed_forward.experts.8.w1
    - model.layers.23.feed_forward.experts.8.w3
    - model.layers.23.feed_forward.experts.9.w1
    - model.layers.23.feed_forward.experts.9.w3
    - model.layers.23.feed_forward.experts.10.w1
    - model.layers.23.feed_forward.experts.10.w3
    - model.layers.23.feed_forward.experts.11.w1
    - model.layers.23.feed_forward.experts.11.w3
    - model.layers.23.feed_forward.experts.12.w1
    - model.layers.23.feed_forward.experts.12.w3
    - model.layers.23.feed_forward.experts.13.w1
    - model.layers.23.feed_forward.experts.13.w3
    - model.layers.23.feed_forward.experts.14.w1
    - model.layers.23.feed_forward.experts.14.w3
    - model.layers.23.feed_forward.experts.15.w1
    - model.layers.23.feed_forward.experts.15.w3
    - model.layers.23.feed_forward.experts.16.w1
    - model.layers.23.feed_forward.experts.16.w3
    - model.layers.23.feed_forward.experts.17.w1
    - model.layers.23.feed_forward.experts.17.w3
    - model.layers.23.feed_forward.experts.18.w1
    - model.layers.23.feed_forward.experts.18.w3
    - model.layers.23.feed_forward.experts.19.w1
    - model.layers.23.feed_forward.experts.19.w3
    - model.layers.23.feed_forward.experts.20.w1
    - model.layers.23.feed_forward.experts.20.w3
    - model.layers.23.feed_forward.experts.21.w1
    - model.layers.23.feed_forward.experts.21.w3
    - model.layers.23.feed_forward.experts.22.w1
    - model.layers.23.feed_forward.experts.22.w3
    - model.layers.23.feed_forward.experts.23.w1
    - model.layers.23.feed_forward.experts.23.w3
    - model.layers.23.feed_forward.experts.24.w1
    - model.layers.23.feed_forward.experts.24.w3
    - model.layers.23.feed_forward.experts.25.w1
    - model.layers.23.feed_forward.experts.25.w3
    - model.layers.23.feed_forward.experts.26.w1
    - model.layers.23.feed_forward.experts.26.w3
    - model.layers.23.feed_forward.experts.27.w1
    - model.layers.23.feed_forward.experts.27.w3
    - model.layers.23.feed_forward.experts.28.w1
    - model.layers.23.feed_forward.experts.28.w3
    - model.layers.23.feed_forward.experts.29.w1
    - model.layers.23.feed_forward.experts.29.w3
    - model.layers.23.feed_forward.experts.30.w1
    - model.layers.23.feed_forward.experts.30.w3
    - model.layers.23.feed_forward.experts.31.w1
    - model.layers.23.feed_forward.experts.31.w3
# Final mapping entry: model.embedding_norm is paired with lm_head.
- smooth_layer: model.embedding_norm
  balance_layers:
    - lm_head
# Boolean option that follows the mappings list.
# NOTE(review): presumably an AWQModifier-level setting (a sibling of
# `mappings`), not a field of the last mapping entry - the original
# indentation is not visible here, so confirm against the full file.
duo_scaling: true