wangrongsheng committed
Commit 5545668 · verified · 1 Parent(s): fb778bd

fix config

Files changed (1)
  1. config.json +12 -117
config.json CHANGED
@@ -8,25 +8,26 @@
     "AutoModel": "modeling_openpangu_moe.PanguUltraMoEModel",
     "AutoModelForCausalLM": "modeling_openpangu_moe.PanguUltraMoEForCausalLM"
   },
-  "num_dense_layers": 3,
+  "first_k_dense_replace": 3,
   "hidden_act": "silu",
   "hidden_size": 7680,
   "initializer_range": 0.02,
   "intermediate_size": 18432,
-  "attention_kv_lora_dim": 512,
+  "kv_lora_rank": 512,
   "max_position_embeddings": 131072,
   "model_type": "pangu_ultra_moe",
   "moe_intermediate_size": 2048,
-  "num_routed_experts": 256,
-  "num_shared_experts": 1,
+  "n_routed_experts": 256,
+  "n_shared_experts": 1,
+  "norm_topk_prob": true,
   "num_attention_heads": 128,
   "num_experts_per_tok": 8,
   "num_hidden_layers": 61,
   "num_key_value_heads": 128,
-  "num_mtp_layers": 1,
-  "attention_q_lora_dim": 1536,
-  "attention_qk_dim": 128,
-  "attention_qk_rope_dim": 64,
+  "num_nextn_predict_layers": 1,
+  "q_lora_rank": 1536,
+  "qk_nope_head_dim": 128,
+  "qk_rope_head_dim": 64,
   "rms_norm_eps": 1e-05,
   "rope_theta": 25600000,
   "routed_scaling_factor": 2.5,
@@ -35,112 +36,6 @@
   "torch_dtype": "bfloat16",
   "transformers_version": "4.48.2",
   "use_cache": true,
-  "attention_v_dim": 128,
-  "vocab_size": 153600,
-  "quantize": "w8a8_dynamic",
-  "quantization_config": {
-    "config_groups": {
-      "group_0": {
-        "input_activations": {
-          "actorder": null,
-          "block_structure": null,
-          "dynamic": true,
-          "group_size": null,
-          "num_bits": 8,
-          "observer": "memoryless",
-          "observer_kwargs": {},
-          "strategy": "token",
-          "symmetric": true,
-          "type": "int"
-        },
-        "output_activations": null,
-        "targets": [
-          "Linear"
-        ],
-        "weights": {
-          "actorder": null,
-          "block_structure": null,
-          "dynamic": false,
-          "group_size": null,
-          "num_bits": 8,
-          "observer": "minmax",
-          "observer_kwargs": {},
-          "strategy": "channel",
-          "symmetric": true,
-          "type": "int"
-        }
-      }
-    },
-    "format": "int-quantized",
-    "global_compression_ratio": 1.5943962512751308,
-    "ignore": [
-      "model.layers.0.self_attn.kv_b_proj",
-      "model.layers.1.self_attn.kv_b_proj",
-      "model.layers.2.self_attn.kv_b_proj",
-      "model.layers.3.self_attn.kv_b_proj",
-      "model.layers.4.self_attn.kv_b_proj",
-      "model.layers.5.self_attn.kv_b_proj",
-      "model.layers.6.self_attn.kv_b_proj",
-      "model.layers.7.self_attn.kv_b_proj",
-      "model.layers.8.self_attn.kv_b_proj",
-      "model.layers.9.self_attn.kv_b_proj",
-      "model.layers.10.self_attn.kv_b_proj",
-      "model.layers.11.self_attn.kv_b_proj",
-      "model.layers.12.self_attn.kv_b_proj",
-      "model.layers.13.self_attn.kv_b_proj",
-      "model.layers.14.self_attn.kv_b_proj",
-      "model.layers.15.self_attn.kv_b_proj",
-      "model.layers.16.self_attn.kv_b_proj",
-      "model.layers.17.self_attn.kv_b_proj",
-      "model.layers.18.self_attn.kv_b_proj",
-      "model.layers.19.self_attn.kv_b_proj",
-      "model.layers.20.self_attn.kv_b_proj",
-      "model.layers.21.self_attn.kv_b_proj",
-      "model.layers.22.self_attn.kv_b_proj",
-      "model.layers.23.self_attn.kv_b_proj",
-      "model.layers.24.self_attn.kv_b_proj",
-      "model.layers.25.self_attn.kv_b_proj",
-      "model.layers.26.self_attn.kv_b_proj",
-      "model.layers.27.self_attn.kv_b_proj",
-      "model.layers.28.self_attn.kv_b_proj",
-      "model.layers.29.self_attn.kv_b_proj",
-      "model.layers.30.self_attn.kv_b_proj",
-      "model.layers.31.self_attn.kv_b_proj",
-      "model.layers.32.self_attn.kv_b_proj",
-      "model.layers.33.self_attn.kv_b_proj",
-      "model.layers.34.self_attn.kv_b_proj",
-      "model.layers.35.self_attn.kv_b_proj",
-      "model.layers.36.self_attn.kv_b_proj",
-      "model.layers.37.self_attn.kv_b_proj",
-      "model.layers.38.self_attn.kv_b_proj",
-      "model.layers.39.self_attn.kv_b_proj",
-      "model.layers.40.self_attn.kv_b_proj",
-      "model.layers.41.self_attn.kv_b_proj",
-      "model.layers.42.self_attn.kv_b_proj",
-      "model.layers.43.self_attn.kv_b_proj",
-      "model.layers.44.self_attn.kv_b_proj",
-      "model.layers.45.self_attn.kv_b_proj",
-      "model.layers.46.self_attn.kv_b_proj",
-      "model.layers.47.self_attn.kv_b_proj",
-      "model.layers.48.self_attn.kv_b_proj",
-      "model.layers.49.self_attn.kv_b_proj",
-      "model.layers.50.self_attn.kv_b_proj",
-      "model.layers.51.self_attn.kv_b_proj",
-      "model.layers.52.self_attn.kv_b_proj",
-      "model.layers.53.self_attn.kv_b_proj",
-      "model.layers.54.self_attn.kv_b_proj",
-      "model.layers.55.self_attn.kv_b_proj",
-      "model.layers.56.self_attn.kv_b_proj",
-      "model.layers.57.self_attn.kv_b_proj",
-      "model.layers.58.self_attn.kv_b_proj",
-      "model.layers.59.self_attn.kv_b_proj",
-      "model.layers.60.self_attn.kv_b_proj",
-      "lm_head",
-      "model.layers.61.self_attn.kv_b_proj",
-      "model.layers.61.shared_head.head"
-    ],
-    "kv_cache_scheme": null,
-    "quant_method": "compressed-tensors",
-    "quantization_status": "compressed"
-  }
-}
+  "v_head_dim": 128,
+  "vocab_size": 153600
+}
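
For reference, the key renames in this commit can be applied to an older copy of config.json with a small script. The sketch below is illustrative only: the file names and the migrate_config helper are hypothetical, not part of this repository, and it handles only the renames; it does not add the new "norm_topk_prob" flag or remove the dropped "quantize" / "quantization_config" entries, which this commit also changes.

```python
import json

# Legacy -> new key names, taken directly from the renames in this diff.
RENAMED_KEYS = {
    "num_dense_layers": "first_k_dense_replace",
    "attention_kv_lora_dim": "kv_lora_rank",
    "num_routed_experts": "n_routed_experts",
    "num_shared_experts": "n_shared_experts",
    "num_mtp_layers": "num_nextn_predict_layers",
    "attention_q_lora_dim": "q_lora_rank",
    "attention_qk_dim": "qk_nope_head_dim",
    "attention_qk_rope_dim": "qk_rope_head_dim",
    "attention_v_dim": "v_head_dim",
}


def migrate_config(path_in: str, path_out: str) -> None:
    """Rename legacy config keys to the names used after this commit.

    Only the renames are handled; adding "norm_topk_prob" and dropping
    the quantization block are left out of this sketch.
    """
    with open(path_in, encoding="utf-8") as f:
        cfg = json.load(f)
    for old, new in RENAMED_KEYS.items():
        if old in cfg:
            cfg[new] = cfg.pop(old)
    with open(path_out, "w", encoding="utf-8") as f:
        json.dump(cfg, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    # Hypothetical file names, for illustration only.
    migrate_config("config.old.json", "config.json")
```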