AmirMohseni committed on
Commit
249021e
·
verified ·
1 Parent(s): d0518e3

Step 100 checkpoint

Browse files
Files changed (7) hide show
  1. config.json +1 -1
  2. model.safetensors +2 -2
  3. optimizer.pt +2 -2
  4. rng_state.pth +1 -1
  5. scheduler.pt +1 -1
  6. trainer_state.json +49 -131
  7. training_args.bin +1 -1
config.json CHANGED
@@ -5,7 +5,7 @@
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
  "bos_token_id": 151643,
8
- "dtype": "float32",
9
  "eos_token_id": 151645,
10
  "head_dim": 128,
11
  "hidden_act": "silu",
 
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
  "bos_token_id": 151643,
8
+ "dtype": "bfloat16",
9
  "eos_token_id": 151645,
10
  "head_dim": 128,
11
  "hidden_act": "silu",
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42c583567aff6763908a4a70c95ed07632a1b3817042ddbb9ad01b7314f01f4c
3
- size 2383149616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25a7c66986e22717fe2077f209ad1a22a84e7795f69a1c6341faf45ce6fb7cb2
3
+ size 1191592464
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec3ad5c8c396ff7e98c5b9125046e29d8fa5acd301c9b9b512936681dfbc2daf
3
- size 4766499383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58704f65ae428ad54e53b5abff9d2a289b2e10cdb85a077e65126e7ffdd13fd8
3
+ size 2383380107
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4a9f217e852f439efa6bd32fde98d6867f11aa6ea13ddc021ba10af6a0b0934
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:23e0a0c2405202381e8a2ca3d56409fadc1c3b478272da41961a648af745ca8f
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f324ed80934e0c10732eee7128f865a736e9560cf0a916edb7195451de6caad4
3
  size 1465
trainer_state.json CHANGED
@@ -1,188 +1,106 @@
1
  {
2
  "best_global_step": 100,
3
- "best_metric": 0.5881340509962598,
4
- "best_model_checkpoint": "./experiments/qwen3-0.6b-router-lr1e-5-ep2-batch16-20250917-10:12/checkpoint-100",
5
- "epoch": 0.046484601975595584,
6
  "eval_steps": 100,
7
- "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.00023242300987797793,
14
- "grad_norm": 436.9029846191406,
15
  "learning_rate": 0.0,
16
- "loss": 3.2609,
17
  "step": 1
18
  },
19
  {
20
- "epoch": 0.002324230098779779,
21
- "grad_norm": 212.0109405517578,
22
  "learning_rate": 9.000000000000001e-07,
23
- "loss": 2.0978,
24
  "step": 10
25
  },
26
  {
27
- "epoch": 0.004648460197559558,
28
- "grad_norm": 199.0899658203125,
29
  "learning_rate": 1.9000000000000002e-06,
30
- "loss": 1.849,
31
  "step": 20
32
  },
33
  {
34
- "epoch": 0.006972690296339338,
35
- "grad_norm": 195.7217559814453,
36
  "learning_rate": 2.9e-06,
37
- "loss": 1.8741,
38
  "step": 30
39
  },
40
  {
41
- "epoch": 0.009296920395119116,
42
- "grad_norm": 200.8784942626953,
43
  "learning_rate": 3.900000000000001e-06,
44
- "loss": 1.8469,
45
  "step": 40
46
  },
47
  {
48
- "epoch": 0.011621150493898896,
49
- "grad_norm": 219.49838256835938,
50
  "learning_rate": 4.9000000000000005e-06,
51
- "loss": 1.8834,
52
  "step": 50
53
  },
54
  {
55
- "epoch": 0.013945380592678676,
56
- "grad_norm": 131.50750732421875,
57
  "learning_rate": 5.9e-06,
58
- "loss": 1.9368,
59
  "step": 60
60
  },
61
  {
62
- "epoch": 0.016269610691458453,
63
- "grad_norm": 75.23955535888672,
64
  "learning_rate": 6.9e-06,
65
- "loss": 1.9056,
66
  "step": 70
67
  },
68
  {
69
- "epoch": 0.018593840790238233,
70
- "grad_norm": 68.3598861694336,
71
  "learning_rate": 7.9e-06,
72
- "loss": 1.5776,
73
  "step": 80
74
  },
75
  {
76
- "epoch": 0.020918070889018012,
77
- "grad_norm": 77.42771911621094,
78
  "learning_rate": 8.900000000000001e-06,
79
- "loss": 1.4744,
80
  "step": 90
81
  },
82
  {
83
- "epoch": 0.023242300987797792,
84
- "grad_norm": 69.98379516601562,
85
  "learning_rate": 9.9e-06,
86
- "loss": 1.7182,
87
  "step": 100
88
  },
89
  {
90
- "epoch": 0.023242300987797792,
91
- "eval_accuracy": 0.5880728879072336,
92
- "eval_f1": 0.5881340509962598,
93
- "eval_loss": 0.7600654363632202,
94
- "eval_precision": 0.5882081113096272,
95
- "eval_recall": 0.5880728879072336,
96
- "eval_runtime": 21.2273,
97
- "eval_samples_per_second": 85.315,
98
- "eval_steps_per_second": 10.694,
99
  "step": 100
100
- },
101
- {
102
- "epoch": 0.02556653108657757,
103
- "grad_norm": 41.54159927368164,
104
- "learning_rate": 9.989419233482248e-06,
105
- "loss": 1.3959,
106
- "step": 110
107
- },
108
- {
109
- "epoch": 0.02789076118535735,
110
- "grad_norm": 28.72027015686035,
111
- "learning_rate": 9.977662826240301e-06,
112
- "loss": 1.5775,
113
- "step": 120
114
- },
115
- {
116
- "epoch": 0.03021499128413713,
117
- "grad_norm": 25.74260902404785,
118
- "learning_rate": 9.965906418998354e-06,
119
- "loss": 1.3155,
120
- "step": 130
121
- },
122
- {
123
- "epoch": 0.03253922138291691,
124
- "grad_norm": 63.235862731933594,
125
- "learning_rate": 9.954150011756408e-06,
126
- "loss": 1.496,
127
- "step": 140
128
- },
129
- {
130
- "epoch": 0.034863451481696686,
131
- "grad_norm": 57.411529541015625,
132
- "learning_rate": 9.94239360451446e-06,
133
- "loss": 1.5263,
134
- "step": 150
135
- },
136
- {
137
- "epoch": 0.037187681580476466,
138
- "grad_norm": 44.55034255981445,
139
- "learning_rate": 9.930637197272515e-06,
140
- "loss": 1.2156,
141
- "step": 160
142
- },
143
- {
144
- "epoch": 0.039511911679256245,
145
- "grad_norm": 44.788516998291016,
146
- "learning_rate": 9.918880790030567e-06,
147
- "loss": 1.3689,
148
- "step": 170
149
- },
150
- {
151
- "epoch": 0.041836141778036025,
152
- "grad_norm": 24.783201217651367,
153
- "learning_rate": 9.907124382788622e-06,
154
- "loss": 1.3457,
155
- "step": 180
156
- },
157
- {
158
- "epoch": 0.044160371876815804,
159
- "grad_norm": 43.4869384765625,
160
- "learning_rate": 9.895367975546673e-06,
161
- "loss": 1.392,
162
- "step": 190
163
- },
164
- {
165
- "epoch": 0.046484601975595584,
166
- "grad_norm": 21.64163589477539,
167
- "learning_rate": 9.883611568304728e-06,
168
- "loss": 1.2688,
169
- "step": 200
170
- },
171
- {
172
- "epoch": 0.046484601975595584,
173
- "eval_accuracy": 0.5394809497515185,
174
- "eval_f1": 0.4162900198419928,
175
- "eval_loss": 0.7904826998710632,
176
- "eval_precision": 0.5986918178262048,
177
- "eval_recall": 0.5394809497515185,
178
- "eval_runtime": 23.8222,
179
- "eval_samples_per_second": 76.021,
180
- "eval_steps_per_second": 9.529,
181
- "step": 200
182
  }
183
  ],
184
  "logging_steps": 10,
185
- "max_steps": 8606,
186
  "num_input_tokens_seen": 0,
187
  "num_train_epochs": 2,
188
  "save_steps": 100,
@@ -198,8 +116,8 @@
198
  "attributes": {}
199
  }
200
  },
201
- "total_flos": 4883735577526272.0,
202
- "train_batch_size": 8,
203
  "trial_name": null,
204
  "trial_params": null
205
  }
 
1
  {
2
  "best_global_step": 100,
3
+ "best_metric": 0.5420321197439284,
4
+ "best_model_checkpoint": "./experiments/qwen3-0.6b-router-lr1e-5-ep2-batch20-20250917-11:04/checkpoint-100",
5
+ "epoch": 0.02905287623474724,
6
  "eval_steps": 100,
7
+ "global_step": 100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.0002905287623474724,
14
+ "grad_norm": 422.0,
15
  "learning_rate": 0.0,
16
+ "loss": 4.1917,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.002905287623474724,
21
+ "grad_norm": 260.0,
22
  "learning_rate": 9.000000000000001e-07,
23
+ "loss": 3.1155,
24
  "step": 10
25
  },
26
  {
27
+ "epoch": 0.005810575246949448,
28
+ "grad_norm": 362.0,
29
  "learning_rate": 1.9000000000000002e-06,
30
+ "loss": 3.6823,
31
  "step": 20
32
  },
33
  {
34
+ "epoch": 0.008715862870424172,
35
+ "grad_norm": 362.0,
36
  "learning_rate": 2.9e-06,
37
+ "loss": 3.5894,
38
  "step": 30
39
  },
40
  {
41
+ "epoch": 0.011621150493898896,
42
+ "grad_norm": 286.0,
43
  "learning_rate": 3.900000000000001e-06,
44
+ "loss": 2.7935,
45
  "step": 40
46
  },
47
  {
48
+ "epoch": 0.01452643811737362,
49
+ "grad_norm": 108.5,
50
  "learning_rate": 4.9000000000000005e-06,
51
+ "loss": 2.1466,
52
  "step": 50
53
  },
54
  {
55
+ "epoch": 0.017431725740848343,
56
+ "grad_norm": 171.0,
57
  "learning_rate": 5.9e-06,
58
+ "loss": 2.174,
59
  "step": 60
60
  },
61
  {
62
+ "epoch": 0.02033701336432307,
63
+ "grad_norm": 182.0,
64
  "learning_rate": 6.9e-06,
65
+ "loss": 1.9269,
66
  "step": 70
67
  },
68
  {
69
+ "epoch": 0.023242300987797792,
70
+ "grad_norm": 152.0,
71
  "learning_rate": 7.9e-06,
72
+ "loss": 2.0153,
73
  "step": 80
74
  },
75
  {
76
+ "epoch": 0.026147588611272515,
77
+ "grad_norm": 124.5,
78
  "learning_rate": 8.900000000000001e-06,
79
+ "loss": 1.7756,
80
  "step": 90
81
  },
82
  {
83
+ "epoch": 0.02905287623474724,
84
+ "grad_norm": 220.0,
85
  "learning_rate": 9.9e-06,
86
+ "loss": 1.7598,
87
  "step": 100
88
  },
89
  {
90
+ "epoch": 0.02905287623474724,
91
+ "eval_accuracy": 0.5466593042517945,
92
+ "eval_f1": 0.5420321197439284,
93
+ "eval_loss": 0.8443426489830017,
94
+ "eval_precision": 0.5556366418233831,
95
+ "eval_recall": 0.5466593042517945,
96
+ "eval_runtime": 17.2355,
97
+ "eval_samples_per_second": 105.074,
98
+ "eval_steps_per_second": 10.56,
99
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  }
101
  ],
102
  "logging_steps": 10,
103
+ "max_steps": 6884,
104
  "num_input_tokens_seen": 0,
105
  "num_train_epochs": 2,
106
  "save_steps": 100,
 
116
  "attributes": {}
117
  }
118
  },
119
+ "total_flos": 3475251530219520.0,
120
+ "train_batch_size": 10,
121
  "trial_name": null,
122
  "trial_params": null
123
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:47e77b9b45b10de55d9b76f3adeba1743c0110e08e02f549e795b1c0fe6ff3f3
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45c8c810968be5df474115cdf0e9ea73d835ad3273bd65a53866a284726a0e67
3
  size 5905