MAGICYA0 commited on
Commit
c4444d4
·
verified ·
1 Parent(s): 09e0072

End of training

Browse files
README.md CHANGED
@@ -40,7 +40,7 @@ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing
40
 
41
  - TRL: 0.15.2
42
  - Transformers: 4.48.2
43
- - Pytorch: 2.5.1+cu121
44
  - Datasets: 3.6.0
45
  - Tokenizers: 0.21.1
46
 
 
40
 
41
  - TRL: 0.15.2
42
  - Transformers: 4.48.2
43
+ - Pytorch: 2.5.1
44
  - Datasets: 3.6.0
45
  - Tokenizers: 0.21.1
46
 
adapter_config.json CHANGED
@@ -24,12 +24,12 @@
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
 
 
 
27
  "gate_proj",
28
  "up_proj",
29
  "q_proj",
30
- "o_proj",
31
- "down_proj",
32
- "k_proj",
33
  "v_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
 
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
27
+ "o_proj",
28
+ "k_proj",
29
+ "down_proj",
30
  "gate_proj",
31
  "up_proj",
32
  "q_proj",
 
 
 
33
  "v_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96970c5caa74a034e0f75503d062ccf3c79aa1e4bb1da783621dc643a7ef9a3d
3
  size 35237104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc45f3686b0cba067c51fcf726eebe499a2e6560ef66389a088444fda4850481
3
  size 35237104
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 8.942109707277268e-05,
4
- "train_runtime": 4099.4474,
5
- "train_samples": 262,
6
- "train_samples_per_second": 6.245,
7
- "train_steps_per_second": 0.049
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 5.2903332107234745e-06,
4
+ "train_runtime": 1863.6299,
5
+ "train_samples": 84,
6
+ "train_samples_per_second": 6.868,
7
+ "train_steps_per_second": 0.054
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 8.942109707277268e-05,
4
- "train_runtime": 4099.4474,
5
- "train_samples": 262,
6
- "train_samples_per_second": 6.245,
7
- "train_steps_per_second": 0.049
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 5.2903332107234745e-06,
4
+ "train_runtime": 1863.6299,
5
+ "train_samples": 84,
6
+ "train_samples_per_second": 6.868,
7
+ "train_steps_per_second": 0.054
8
  }
trainer_state.json CHANGED
@@ -1,787 +1,407 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 11.793893129770993,
5
  "eval_steps": 500,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 228.6078125,
13
- "epoch": 0.3053435114503817,
14
- "grad_norm": 1.6697295904159546,
15
- "kl": 0.08263390087522567,
16
- "learning_rate": 2.5e-07,
17
- "loss": 0.0001,
18
- "reward": 2.8884417802095412,
19
- "reward_std": 3.0484682977199555,
20
- "rewards/concensus_correctness_reward_func": 1.5492031248286366,
21
- "rewards/consensus_reward_func": 0.2,
22
- "rewards/cumulative_reward_2": 0.0,
23
- "rewards/final_correctness_reward_func": 0.21875,
24
- "rewards/question_recreation_reward_func": 0.2991667349822819,
25
- "rewards/soft_format_reward_func": 0.00078125,
26
- "rewards/strict_format_reward_func": 0.00078125,
27
- "rewards/xmlcount_reward_func": 0.6197593748569489,
28
- "step": 5
29
- },
30
- {
31
- "completion_length": 225.8765625,
32
- "epoch": 0.6106870229007634,
33
- "grad_norm": 1.4577786922454834,
34
- "kl": 0.08271063729189336,
35
  "learning_rate": 5e-07,
36
- "loss": 0.0001,
37
- "reward": 2.161433610320091,
38
- "reward_std": 2.5517727360129356,
39
- "rewards/concensus_correctness_reward_func": 0.857334372587502,
40
- "rewards/consensus_reward_func": 0.203125,
41
- "rewards/cumulative_reward_2": 0.0,
42
- "rewards/final_correctness_reward_func": 0.196875,
43
- "rewards/question_recreation_reward_func": 0.2814491997472942,
44
- "rewards/soft_format_reward_func": 0.00234375,
45
- "rewards/strict_format_reward_func": 0.0015625,
46
- "rewards/xmlcount_reward_func": 0.6187437549233437,
47
- "step": 10
48
- },
49
- {
50
- "completion_length": 223.1671875,
51
- "epoch": 0.916030534351145,
52
- "grad_norm": 40.90731430053711,
53
- "kl": 0.08677452662959695,
54
- "learning_rate": 4.991461232516674e-07,
55
- "loss": 0.0001,
56
- "reward": 2.2503652930259705,
57
- "reward_std": 2.3072509482502936,
58
- "rewards/concensus_correctness_reward_func": 0.9038093611598015,
59
- "rewards/consensus_reward_func": 0.209375,
60
- "rewards/cumulative_reward_2": 0.0,
61
- "rewards/final_correctness_reward_func": 0.18125,
62
- "rewards/question_recreation_reward_func": 0.3275418411940336,
63
  "rewards/soft_format_reward_func": 0.00078125,
64
- "rewards/strict_format_reward_func": 0.0015625,
65
- "rewards/xmlcount_reward_func": 0.6260453149676323,
66
- "step": 15
67
  },
68
  {
69
- "completion_length": 225.08928571428572,
70
- "epoch": 1.183206106870229,
71
- "grad_norm": 2.2080225944519043,
72
- "kl": 0.09310175127216748,
73
  "learning_rate": 4.965903258506806e-07,
74
- "loss": 0.0001,
75
- "reward": 2.451156176839556,
76
- "reward_std": 2.8527250630514964,
77
- "rewards/concensus_correctness_reward_func": 1.0764642748449529,
78
- "rewards/consensus_reward_func": 0.26071428571428573,
79
- "rewards/cumulative_reward_2": 0.0,
80
- "rewards/final_correctness_reward_func": 0.22857142857142856,
81
- "rewards/question_recreation_reward_func": 0.2879704715950148,
82
- "rewards/soft_format_reward_func": 0.0008928571428571428,
83
- "rewards/strict_format_reward_func": 0.0008928571428571428,
84
- "rewards/xmlcount_reward_func": 0.5956499985286168,
85
- "step": 20
86
- },
87
- {
88
- "completion_length": 227.5796875,
89
- "epoch": 1.4885496183206106,
90
- "grad_norm": 1.866131067276001,
91
- "kl": 0.08186168665997684,
92
- "learning_rate": 4.923500664848326e-07,
93
- "loss": 0.0001,
94
- "reward": 2.17223744392395,
95
- "reward_std": 2.3266281485557556,
96
- "rewards/concensus_correctness_reward_func": 0.822290619648993,
97
- "rewards/consensus_reward_func": 0.2,
98
- "rewards/cumulative_reward_2": 0.0,
99
- "rewards/final_correctness_reward_func": 0.1625,
100
- "rewards/question_recreation_reward_func": 0.36023432165384295,
101
- "rewards/soft_format_reward_func": 0.0015625,
102
- "rewards/strict_format_reward_func": 0.00078125,
103
- "rewards/xmlcount_reward_func": 0.6248687498271466,
104
- "step": 25
105
  },
106
  {
107
- "completion_length": 227.615625,
108
- "epoch": 1.7938931297709924,
109
- "grad_norm": 1.6272974014282227,
110
- "kl": 0.08450978258624672,
111
  "learning_rate": 4.864543104251586e-07,
112
- "loss": 0.0001,
113
- "reward": 2.169202183187008,
114
- "reward_std": 2.4988188624382017,
115
- "rewards/concensus_correctness_reward_func": 0.8738718747161329,
116
- "rewards/consensus_reward_func": 0.196875,
117
- "rewards/cumulative_reward_2": 0.0,
118
- "rewards/final_correctness_reward_func": 0.215625,
119
- "rewards/question_recreation_reward_func": 0.2872584268450737,
120
- "rewards/soft_format_reward_func": 0.00078125,
121
- "rewards/strict_format_reward_func": 0.00234375,
122
- "rewards/xmlcount_reward_func": 0.5924468792974948,
123
- "step": 30
124
- },
125
- {
126
- "completion_length": 221.56607142857143,
127
- "epoch": 2.0610687022900764,
128
- "grad_norm": 1.8112502098083496,
129
- "kl": 0.09125068480415004,
130
- "learning_rate": 4.789433316637643e-07,
131
- "loss": 0.0001,
132
- "reward": 2.663744555200849,
133
- "reward_std": 3.0470116555690767,
134
- "rewards/concensus_correctness_reward_func": 1.3009928594742501,
135
- "rewards/consensus_reward_func": 0.25357142857142856,
136
- "rewards/cumulative_reward_2": 0.0,
137
- "rewards/final_correctness_reward_func": 0.175,
138
- "rewards/question_recreation_reward_func": 0.28507488646677565,
139
- "rewards/soft_format_reward_func": 0.0017857142857142857,
140
- "rewards/strict_format_reward_func": 0.0017857142857142857,
141
- "rewards/xmlcount_reward_func": 0.6455339295523507,
142
- "step": 35
143
  },
144
  {
145
- "completion_length": 229.5390625,
146
- "epoch": 2.366412213740458,
147
- "grad_norm": 1.5192793607711792,
148
- "kl": 0.08173536635003983,
149
  "learning_rate": 4.698684378016222e-07,
150
- "loss": 0.0001,
151
- "reward": 1.9662634998559951,
152
- "reward_std": 2.307476855814457,
153
- "rewards/concensus_correctness_reward_func": 0.6928218763321639,
154
- "rewards/consensus_reward_func": 0.2,
155
- "rewards/cumulative_reward_2": 0.0,
156
- "rewards/final_correctness_reward_func": 0.1625,
157
- "rewards/question_recreation_reward_func": 0.313546329177916,
158
- "rewards/soft_format_reward_func": 0.00078125,
159
- "rewards/strict_format_reward_func": 0.00078125,
160
- "rewards/xmlcount_reward_func": 0.5958328068256378,
161
- "step": 40
162
- },
163
- {
164
- "completion_length": 226.2484375,
165
- "epoch": 2.67175572519084,
166
- "grad_norm": 1.7399648427963257,
167
- "kl": 0.0866468169959262,
168
- "learning_rate": 4.592916195656321e-07,
169
- "loss": 0.0001,
170
- "reward": 2.2245844706892965,
171
- "reward_std": 2.348295548558235,
172
- "rewards/concensus_correctness_reward_func": 0.8688468646258116,
173
- "rewards/consensus_reward_func": 0.21875,
174
- "rewards/cumulative_reward_2": 0.0,
175
- "rewards/final_correctness_reward_func": 0.20625,
176
- "rewards/question_recreation_reward_func": 0.3035891577601433,
177
- "rewards/soft_format_reward_func": 0.0015625,
178
- "rewards/strict_format_reward_func": 0.0015625,
179
- "rewards/xmlcount_reward_func": 0.6240234360098839,
180
- "step": 45
181
- },
182
- {
183
- "completion_length": 229.1890625,
184
- "epoch": 2.9770992366412212,
185
- "grad_norm": 1.76873779296875,
186
- "kl": 0.09108880357816815,
187
- "learning_rate": 4.472851273490984e-07,
188
- "loss": 0.0001,
189
- "reward": 2.0511497393250466,
190
- "reward_std": 1.911637831479311,
191
- "rewards/concensus_correctness_reward_func": 0.6462500024586916,
192
- "rewards/consensus_reward_func": 0.284375,
193
- "rewards/cumulative_reward_2": 0.0,
194
- "rewards/final_correctness_reward_func": 0.196875,
195
- "rewards/question_recreation_reward_func": 0.2953185113146901,
196
- "rewards/soft_format_reward_func": 0.00078125,
197
  "rewards/strict_format_reward_func": 0.0,
198
- "rewards/xmlcount_reward_func": 0.6275500006973743,
199
- "step": 50
200
  },
201
  {
202
- "completion_length": 230.36964285714285,
203
- "epoch": 3.2442748091603053,
204
- "grad_norm": 13.211128234863281,
205
- "kl": 0.09072041969214167,
206
- "learning_rate": 4.339309776682829e-07,
207
- "loss": 0.0001,
208
- "reward": 2.326539208207812,
209
- "reward_std": 2.8010854831763674,
210
- "rewards/concensus_correctness_reward_func": 1.093885723395007,
211
- "rewards/consensus_reward_func": 0.19285714285714287,
212
- "rewards/cumulative_reward_2": 0.0,
213
- "rewards/final_correctness_reward_func": 0.14642857142857144,
214
- "rewards/question_recreation_reward_func": 0.2836945739175592,
215
- "rewards/soft_format_reward_func": 0.0,
216
- "rewards/strict_format_reward_func": 0.0017857142857142857,
217
- "rewards/xmlcount_reward_func": 0.6078875030790056,
218
- "step": 55
219
  },
220
  {
221
- "completion_length": 222.096875,
222
- "epoch": 3.549618320610687,
223
- "grad_norm": 1.6315250396728516,
224
- "kl": 0.08041337295435369,
225
  "learning_rate": 4.193203929064353e-07,
226
- "loss": 0.0001,
227
- "reward": 2.38142469227314,
228
- "reward_std": 2.643636444211006,
229
- "rewards/concensus_correctness_reward_func": 1.0123843706212938,
230
- "rewards/consensus_reward_func": 0.196875,
231
- "rewards/cumulative_reward_2": 0.0,
232
- "rewards/final_correctness_reward_func": 0.228125,
233
- "rewards/question_recreation_reward_func": 0.3176996737718582,
234
- "rewards/soft_format_reward_func": 0.0015625,
235
- "rewards/strict_format_reward_func": 0.00078125,
236
- "rewards/xmlcount_reward_func": 0.623996875435114,
237
- "step": 60
238
- },
239
- {
240
- "completion_length": 227.6765625,
241
- "epoch": 3.854961832061069,
242
- "grad_norm": 1.5087581872940063,
243
- "kl": 0.09321223869919777,
244
- "learning_rate": 4.0355317817241697e-07,
245
- "loss": 0.0001,
246
- "reward": 2.3100090995430946,
247
- "reward_std": 2.631418401747942,
248
- "rewards/concensus_correctness_reward_func": 0.9473375119268894,
249
- "rewards/consensus_reward_func": 0.2375,
250
- "rewards/cumulative_reward_2": 0.0,
251
- "rewards/final_correctness_reward_func": 0.20625,
252
- "rewards/question_recreation_reward_func": 0.29184813760221007,
253
- "rewards/soft_format_reward_func": 0.003125,
254
- "rewards/strict_format_reward_func": 0.0,
255
- "rewards/xmlcount_reward_func": 0.623948446661234,
256
- "step": 65
257
  },
258
  {
259
- "completion_length": 226.1017857142857,
260
- "epoch": 4.122137404580153,
261
- "grad_norm": 1.5481903553009033,
262
- "kl": 0.09283565463764328,
263
  "learning_rate": 3.867370395306068e-07,
264
- "loss": 0.0001,
265
- "reward": 2.0852925266538347,
266
- "reward_std": 2.410001914841788,
267
- "rewards/concensus_correctness_reward_func": 0.7340392986578601,
268
- "rewards/consensus_reward_func": 0.21785714285714286,
269
  "rewards/cumulative_reward_2": 0.0,
270
- "rewards/final_correctness_reward_func": 0.2,
271
- "rewards/question_recreation_reward_func": 0.3157889977097511,
272
  "rewards/soft_format_reward_func": 0.0,
273
- "rewards/strict_format_reward_func": 0.0,
274
- "rewards/xmlcount_reward_func": 0.6176071413925716,
275
- "step": 70
276
- },
277
- {
278
- "completion_length": 224.4625,
279
- "epoch": 4.427480916030534,
280
- "grad_norm": 1.6221083402633667,
281
- "kl": 0.09271430894732476,
282
- "learning_rate": 3.689868482592684e-07,
283
- "loss": 0.0001,
284
- "reward": 2.5825726106762885,
285
- "reward_std": 2.6193617291748525,
286
- "rewards/concensus_correctness_reward_func": 1.1844156241044401,
287
- "rewards/consensus_reward_func": 0.196875,
288
- "rewards/cumulative_reward_2": 0.0,
289
- "rewards/final_correctness_reward_func": 0.228125,
290
- "rewards/question_recreation_reward_func": 0.3128054209053516,
291
- "rewards/soft_format_reward_func": 0.00078125,
292
  "rewards/strict_format_reward_func": 0.0015625,
293
- "rewards/xmlcount_reward_func": 0.6580078117549419,
294
- "step": 75
295
  },
296
  {
297
- "completion_length": 227.3359375,
298
- "epoch": 4.732824427480916,
299
- "grad_norm": 1.699670672416687,
300
- "kl": 0.09714379052165896,
301
  "learning_rate": 3.5042385616324236e-07,
302
- "loss": 0.0001,
303
- "reward": 2.3627020329236985,
304
- "reward_std": 2.499714456498623,
305
- "rewards/concensus_correctness_reward_func": 1.0211906284559518,
306
- "rewards/consensus_reward_func": 0.225,
307
- "rewards/cumulative_reward_2": 0.0,
308
- "rewards/final_correctness_reward_func": 0.209375,
309
- "rewards/question_recreation_reward_func": 0.2893488988280296,
310
  "rewards/soft_format_reward_func": 0.0,
311
- "rewards/strict_format_reward_func": 0.00234375,
312
- "rewards/xmlcount_reward_func": 0.6154437460005283,
313
- "step": 80
314
- },
315
- {
316
- "completion_length": 227.6017857142857,
317
- "epoch": 5.0,
318
- "grad_norm": 0.8229286074638367,
319
- "kl": 0.08785766730351108,
320
- "learning_rate": 3.3117486730117087e-07,
321
- "loss": 0.0001,
322
- "reward": 2.0304165261132376,
323
- "reward_std": 2.2844339413302284,
324
- "rewards/concensus_correctness_reward_func": 0.7656464321272713,
325
- "rewards/consensus_reward_func": 0.2,
326
- "rewards/cumulative_reward_2": 0.0,
327
- "rewards/final_correctness_reward_func": 0.16071428571428573,
328
- "rewards/question_recreation_reward_func": 0.2893593538020338,
329
- "rewards/soft_format_reward_func": 0.0008928571428571428,
330
- "rewards/strict_format_reward_func": 0.0008928571428571428,
331
- "rewards/xmlcount_reward_func": 0.6129107126167842,
332
- "step": 85
333
  },
334
  {
335
- "completion_length": 224.35625,
336
- "epoch": 5.305343511450381,
337
- "grad_norm": 1.8476825952529907,
338
- "kl": 0.09851492333691567,
339
  "learning_rate": 3.1137137178519977e-07,
340
- "loss": 0.0001,
341
- "reward": 2.0146584630012514,
342
- "reward_std": 2.040700928866863,
343
- "rewards/concensus_correctness_reward_func": 0.6759656336158514,
344
- "rewards/consensus_reward_func": 0.228125,
345
- "rewards/cumulative_reward_2": 0.0,
346
- "rewards/final_correctness_reward_func": 0.178125,
347
- "rewards/question_recreation_reward_func": 0.2884428523480892,
348
- "rewards/soft_format_reward_func": 0.00234375,
349
- "rewards/strict_format_reward_func": 0.00234375,
350
- "rewards/xmlcount_reward_func": 0.6393125005066395,
351
- "step": 90
352
- },
353
- {
354
- "completion_length": 228.8078125,
355
- "epoch": 5.6106870229007635,
356
- "grad_norm": 2.1402406692504883,
357
- "kl": 0.09463290963321924,
358
- "learning_rate": 2.911486475701835e-07,
359
- "loss": 0.0001,
360
- "reward": 3.082321289181709,
361
- "reward_std": 3.3287234142422677,
362
- "rewards/concensus_correctness_reward_func": 1.6685624956153333,
363
- "rewards/consensus_reward_func": 0.25625,
364
- "rewards/cumulative_reward_2": 0.0,
365
- "rewards/final_correctness_reward_func": 0.190625,
366
- "rewards/question_recreation_reward_func": 0.3137759966775775,
367
- "rewards/soft_format_reward_func": 0.0015625,
368
- "rewards/strict_format_reward_func": 0.0015625,
369
- "rewards/xmlcount_reward_func": 0.6499828085303306,
370
- "step": 95
371
  },
372
  {
373
- "completion_length": 227.2109375,
374
- "epoch": 5.916030534351145,
375
- "grad_norm": 1.4899626970291138,
376
- "kl": 0.08579296409152448,
377
  "learning_rate": 2.706448363680831e-07,
378
- "loss": 0.0001,
379
- "reward": 2.692727318406105,
380
- "reward_std": 2.8540393710136414,
381
- "rewards/concensus_correctness_reward_func": 1.338937508687377,
382
- "rewards/consensus_reward_func": 0.246875,
383
- "rewards/cumulative_reward_2": 0.0,
384
- "rewards/final_correctness_reward_func": 0.18125,
385
- "rewards/question_recreation_reward_func": 0.28955701310187576,
386
- "rewards/soft_format_reward_func": 0.00078125,
387
- "rewards/strict_format_reward_func": 0.00078125,
388
- "rewards/xmlcount_reward_func": 0.6345453165471554,
389
- "step": 100
390
- },
391
- {
392
- "completion_length": 225.48392857142858,
393
- "epoch": 6.183206106870229,
394
- "grad_norm": 1.5983153581619263,
395
- "kl": 0.08785093038209847,
396
- "learning_rate": 2.5e-07,
397
- "loss": 0.0001,
398
- "reward": 2.0271904604775566,
399
- "reward_std": 2.129315710919244,
400
- "rewards/concensus_correctness_reward_func": 0.6535428636840411,
401
- "rewards/consensus_reward_func": 0.275,
402
- "rewards/cumulative_reward_2": 0.0,
403
- "rewards/final_correctness_reward_func": 0.12142857142857143,
404
- "rewards/question_recreation_reward_func": 0.300606526860169,
405
- "rewards/soft_format_reward_func": 0.0,
406
- "rewards/strict_format_reward_func": 0.0017857142857142857,
407
- "rewards/xmlcount_reward_func": 0.6748267871992929,
408
- "step": 105
409
  },
410
  {
411
- "completion_length": 224.978125,
412
- "epoch": 6.488549618320611,
413
- "grad_norm": 2.0014476776123047,
414
- "kl": 0.10433612796477973,
415
  "learning_rate": 2.2935516363191693e-07,
416
- "loss": 0.0001,
417
- "reward": 2.448861801624298,
418
- "reward_std": 2.524143137037754,
419
- "rewards/concensus_correctness_reward_func": 0.9977124966681004,
420
- "rewards/consensus_reward_func": 0.2625,
421
  "rewards/cumulative_reward_2": 0.0,
422
- "rewards/final_correctness_reward_func": 0.25,
423
- "rewards/question_recreation_reward_func": 0.28643678678199647,
424
- "rewards/soft_format_reward_func": 0.00078125,
425
- "rewards/strict_format_reward_func": 0.00234375,
426
- "rewards/xmlcount_reward_func": 0.649087505787611,
427
- "step": 110
428
- },
429
- {
430
- "completion_length": 223.5515625,
431
- "epoch": 6.793893129770993,
432
- "grad_norm": 1.6419188976287842,
433
- "kl": 0.09386491114273668,
434
- "learning_rate": 2.0885135242981647e-07,
435
- "loss": 0.0001,
436
- "reward": 2.819219100475311,
437
- "reward_std": 3.3134369418025016,
438
- "rewards/concensus_correctness_reward_func": 1.461875000037253,
439
- "rewards/consensus_reward_func": 0.246875,
440
- "rewards/cumulative_reward_2": 0.0,
441
- "rewards/final_correctness_reward_func": 0.203125,
442
- "rewards/question_recreation_reward_func": 0.27337694596499207,
443
  "rewards/soft_format_reward_func": 0.0,
444
- "rewards/strict_format_reward_func": 0.00078125,
445
- "rewards/xmlcount_reward_func": 0.6331859365105629,
446
- "step": 115
447
  },
448
  {
449
- "completion_length": 228.37857142857143,
450
- "epoch": 7.061068702290076,
451
- "grad_norm": 1.7330297231674194,
452
- "kl": 0.09407830823745046,
453
  "learning_rate": 1.886286282148002e-07,
454
- "loss": 0.0001,
455
- "reward": 2.1238247888428825,
456
- "reward_std": 2.135510918072292,
457
- "rewards/concensus_correctness_reward_func": 0.8201964225087847,
458
- "rewards/consensus_reward_func": 0.19285714285714287,
459
- "rewards/cumulative_reward_2": 0.0,
460
- "rewards/final_correctness_reward_func": 0.17142857142857143,
461
- "rewards/question_recreation_reward_func": 0.312046221111502,
462
- "rewards/soft_format_reward_func": 0.0026785714285714286,
463
- "rewards/strict_format_reward_func": 0.0008928571428571428,
464
- "rewards/xmlcount_reward_func": 0.6237249961921147,
465
- "step": 120
466
- },
467
- {
468
- "completion_length": 225.4453125,
469
- "epoch": 7.366412213740458,
470
- "grad_norm": 1.3467189073562622,
471
- "kl": 0.09670781339518726,
472
- "learning_rate": 1.6882513269882913e-07,
473
- "loss": 0.0001,
474
- "reward": 2.9024840116500856,
475
- "reward_std": 3.2552048370242117,
476
- "rewards/concensus_correctness_reward_func": 1.4498093830421568,
477
- "rewards/consensus_reward_func": 0.2375,
478
- "rewards/cumulative_reward_2": 0.0,
479
- "rewards/final_correctness_reward_func": 0.228125,
480
- "rewards/question_recreation_reward_func": 0.3152043020352721,
481
  "rewards/soft_format_reward_func": 0.0,
482
- "rewards/strict_format_reward_func": 0.0015625,
483
- "rewards/xmlcount_reward_func": 0.6702828057110309,
484
- "step": 125
485
  },
486
  {
487
- "completion_length": 225.0,
488
- "epoch": 7.67175572519084,
489
- "grad_norm": 2.1679375171661377,
490
- "kl": 0.08589964711572975,
491
  "learning_rate": 1.4957614383675767e-07,
492
- "loss": 0.0001,
493
- "reward": 1.8035920888185502,
494
- "reward_std": 1.8396185874938964,
495
- "rewards/concensus_correctness_reward_func": 0.47076874738559127,
496
- "rewards/consensus_reward_func": 0.215625,
497
- "rewards/cumulative_reward_2": 0.0,
498
- "rewards/final_correctness_reward_func": 0.1625,
499
- "rewards/question_recreation_reward_func": 0.30079679638147355,
500
- "rewards/soft_format_reward_func": 0.0015625,
501
  "rewards/strict_format_reward_func": 0.0015625,
502
- "rewards/xmlcount_reward_func": 0.6507765665650368,
503
- "step": 130
504
- },
505
- {
506
- "completion_length": 229.65625,
507
- "epoch": 7.977099236641221,
508
- "grad_norm": 2.253136396408081,
509
- "kl": 0.11240638778544962,
510
- "learning_rate": 1.310131517407316e-07,
511
- "loss": 0.0001,
512
- "reward": 2.460201847553253,
513
- "reward_std": 2.5180085256695746,
514
- "rewards/concensus_correctness_reward_func": 1.081900003645569,
515
- "rewards/consensus_reward_func": 0.246875,
516
- "rewards/cumulative_reward_2": 0.0,
517
- "rewards/final_correctness_reward_func": 0.1875,
518
- "rewards/question_recreation_reward_func": 0.29531903751194477,
519
- "rewards/soft_format_reward_func": 0.00078125,
520
- "rewards/strict_format_reward_func": 0.00078125,
521
- "rewards/xmlcount_reward_func": 0.6470453068614006,
522
- "step": 135
523
  },
524
  {
525
- "completion_length": 225.0625,
526
- "epoch": 8.244274809160306,
527
- "grad_norm": 1.5449919700622559,
528
- "kl": 0.08950371891260148,
529
  "learning_rate": 1.1326296046939333e-07,
530
- "loss": 0.0001,
531
- "reward": 2.3891380735806056,
532
- "reward_std": 2.811609878710338,
533
- "rewards/concensus_correctness_reward_func": 1.032021415233612,
534
- "rewards/consensus_reward_func": 0.22142857142857142,
535
- "rewards/cumulative_reward_2": 0.0,
536
- "rewards/final_correctness_reward_func": 0.21071428571428572,
537
- "rewards/question_recreation_reward_func": 0.2915398589202336,
538
  "rewards/soft_format_reward_func": 0.0,
539
- "rewards/strict_format_reward_func": 0.0,
540
- "rewards/xmlcount_reward_func": 0.6334339363234384,
541
- "step": 140
542
- },
543
- {
544
- "completion_length": 224.471875,
545
- "epoch": 8.549618320610687,
546
- "grad_norm": 1.8451206684112549,
547
- "kl": 0.09755032181274145,
548
- "learning_rate": 9.644682182758304e-08,
549
- "loss": 0.0001,
550
- "reward": 2.3509212240576742,
551
- "reward_std": 2.5712964862585066,
552
- "rewards/concensus_correctness_reward_func": 0.9543406262993812,
553
- "rewards/consensus_reward_func": 0.25,
554
- "rewards/cumulative_reward_2": 0.0,
555
- "rewards/final_correctness_reward_func": 0.209375,
556
- "rewards/question_recreation_reward_func": 0.29743685238063333,
557
- "rewards/soft_format_reward_func": 0.00078125,
558
- "rewards/strict_format_reward_func": 0.0,
559
- "rewards/xmlcount_reward_func": 0.638987497985363,
560
- "step": 145
561
  },
562
  {
563
- "completion_length": 229.9328125,
564
- "epoch": 8.854961832061068,
565
- "grad_norm": 1.6817524433135986,
566
- "kl": 0.09376556426286697,
567
  "learning_rate": 8.067960709356478e-08,
568
- "loss": 0.0001,
569
- "reward": 2.199650877714157,
570
- "reward_std": 2.234014268964529,
571
- "rewards/concensus_correctness_reward_func": 0.8738531303592026,
572
- "rewards/consensus_reward_func": 0.215625,
573
- "rewards/cumulative_reward_2": 0.0,
574
- "rewards/final_correctness_reward_func": 0.16875,
575
- "rewards/question_recreation_reward_func": 0.3147571422159672,
576
- "rewards/soft_format_reward_func": 0.00234375,
577
- "rewards/strict_format_reward_func": 0.0015625,
578
- "rewards/xmlcount_reward_func": 0.6227593772113323,
579
- "step": 150
580
- },
581
- {
582
- "completion_length": 227.23392857142858,
583
- "epoch": 9.122137404580153,
584
- "grad_norm": 1.5826033353805542,
585
- "kl": 0.10102412780480725,
586
- "learning_rate": 6.60690223317171e-08,
587
- "loss": 0.0001,
588
- "reward": 2.717519346305302,
589
- "reward_std": 2.7952321265425,
590
- "rewards/concensus_correctness_reward_func": 1.244885718183858,
591
- "rewards/consensus_reward_func": 0.25,
592
- "rewards/cumulative_reward_2": 0.0,
593
- "rewards/final_correctness_reward_func": 0.24642857142857144,
594
- "rewards/question_recreation_reward_func": 0.30728720192398346,
595
- "rewards/soft_format_reward_func": 0.0,
596
  "rewards/strict_format_reward_func": 0.0,
597
- "rewards/xmlcount_reward_func": 0.6689178551946368,
598
- "step": 155
599
  },
600
  {
601
- "completion_length": 230.4765625,
602
- "epoch": 9.427480916030534,
603
- "grad_norm": 1.7758827209472656,
604
- "kl": 0.09566058879718184,
605
  "learning_rate": 5.271487265090163e-08,
606
- "loss": 0.0001,
607
- "reward": 2.054090151935816,
608
- "reward_std": 2.34877013489604,
609
- "rewards/concensus_correctness_reward_func": 0.8258124882355332,
610
- "rewards/consensus_reward_func": 0.25625,
611
- "rewards/cumulative_reward_2": 0.0,
612
- "rewards/final_correctness_reward_func": 0.08125,
613
- "rewards/question_recreation_reward_func": 0.2795291979797184,
614
- "rewards/soft_format_reward_func": 0.00234375,
615
- "rewards/strict_format_reward_func": 0.0015625,
616
- "rewards/xmlcount_reward_func": 0.6073421865701676,
617
- "step": 160
618
- },
619
- {
620
- "completion_length": 229.909375,
621
- "epoch": 9.732824427480915,
622
- "grad_norm": 1.4019196033477783,
623
- "kl": 0.10127251031808555,
624
- "learning_rate": 4.0708380434367864e-08,
625
- "loss": 0.0001,
626
- "reward": 2.4125003829598426,
627
- "reward_std": 2.4047275736927984,
628
- "rewards/concensus_correctness_reward_func": 1.0241687575355172,
629
- "rewards/consensus_reward_func": 0.265625,
630
- "rewards/cumulative_reward_2": 0.0,
631
- "rewards/final_correctness_reward_func": 0.16875,
632
- "rewards/question_recreation_reward_func": 0.31353163830935954,
633
- "rewards/soft_format_reward_func": 0.0015625,
634
- "rewards/strict_format_reward_func": 0.0015625,
635
- "rewards/xmlcount_reward_func": 0.6372999995946884,
636
- "step": 165
637
  },
638
  {
639
- "completion_length": 227.7017857142857,
640
- "epoch": 10.0,
641
- "grad_norm": 0.6785824298858643,
642
- "kl": 0.08975713018860136,
643
  "learning_rate": 3.013156219837776e-08,
644
- "loss": 0.0001,
645
- "reward": 1.9554564390863691,
646
- "reward_std": 2.0454944218908038,
647
- "rewards/concensus_correctness_reward_func": 0.5938928590289184,
648
- "rewards/consensus_reward_func": 0.22857142857142856,
649
- "rewards/cumulative_reward_2": 0.0,
650
- "rewards/final_correctness_reward_func": 0.25,
651
- "rewards/question_recreation_reward_func": 0.27023680912596837,
652
- "rewards/soft_format_reward_func": 0.0008928571428571428,
653
- "rewards/strict_format_reward_func": 0.0008928571428571428,
654
- "rewards/xmlcount_reward_func": 0.6109696456364223,
655
- "step": 170
656
- },
657
- {
658
- "completion_length": 229.1828125,
659
- "epoch": 10.305343511450381,
660
- "grad_norm": 1.5233851671218872,
661
- "kl": 0.08713910647202283,
662
- "learning_rate": 2.1056668336235623e-08,
663
- "loss": 0.0001,
664
- "reward": 1.773926168680191,
665
- "reward_std": 1.8827015846967696,
666
- "rewards/concensus_correctness_reward_func": 0.5077062356285751,
667
- "rewards/consensus_reward_func": 0.20625,
668
- "rewards/cumulative_reward_2": 0.0,
669
- "rewards/final_correctness_reward_func": 0.175,
670
- "rewards/question_recreation_reward_func": 0.2668996132910252,
671
- "rewards/soft_format_reward_func": 0.0015625,
672
- "rewards/strict_format_reward_func": 0.0015625,
673
- "rewards/xmlcount_reward_func": 0.6149453103542328,
674
- "step": 175
675
- },
676
- {
677
- "completion_length": 224.91875,
678
- "epoch": 10.610687022900763,
679
- "grad_norm": 1.5735443830490112,
680
- "kl": 0.09181699948385358,
681
- "learning_rate": 1.3545689574841341e-08,
682
- "loss": 0.0001,
683
- "reward": 2.5688821971416473,
684
- "reward_std": 3.0575607106089593,
685
- "rewards/concensus_correctness_reward_func": 1.2135156329721213,
686
- "rewards/consensus_reward_func": 0.209375,
687
- "rewards/cumulative_reward_2": 0.0,
688
- "rewards/final_correctness_reward_func": 0.215625,
689
- "rewards/question_recreation_reward_func": 0.3163775021210313,
690
  "rewards/soft_format_reward_func": 0.0,
691
- "rewards/strict_format_reward_func": 0.0,
692
- "rewards/xmlcount_reward_func": 0.6139890596270561,
693
- "step": 180
694
  },
695
  {
696
- "completion_length": 227.946875,
697
- "epoch": 10.916030534351146,
698
- "grad_norm": 1.6556141376495361,
699
- "kl": 0.11013491982594133,
700
- "learning_rate": 7.649933515167406e-09,
701
- "loss": 0.0001,
702
- "reward": 2.8750985130667686,
703
- "reward_std": 3.1339681535959243,
704
- "rewards/concensus_correctness_reward_func": 1.4591718828538434,
705
- "rewards/consensus_reward_func": 0.265625,
706
- "rewards/cumulative_reward_2": 0.0,
707
- "rewards/final_correctness_reward_func": 0.215625,
708
- "rewards/question_recreation_reward_func": 0.30609070267528293,
709
- "rewards/soft_format_reward_func": 0.0,
710
- "rewards/strict_format_reward_func": 0.0,
711
- "rewards/xmlcount_reward_func": 0.6285859420895576,
712
- "step": 185
713
  },
714
  {
715
- "completion_length": 222.18392857142857,
716
- "epoch": 11.183206106870228,
717
- "grad_norm": 1.699262022972107,
718
- "kl": 0.1069457121193409,
719
  "learning_rate": 3.4096741493194193e-09,
720
- "loss": 0.0001,
721
- "reward": 2.5891014899526326,
722
- "reward_std": 2.7870528187070573,
723
- "rewards/concensus_correctness_reward_func": 1.15280357918569,
724
- "rewards/consensus_reward_func": 0.24285714285714285,
725
- "rewards/cumulative_reward_2": 0.0,
726
- "rewards/final_correctness_reward_func": 0.24642857142857144,
727
- "rewards/question_recreation_reward_func": 0.28257290316479544,
728
- "rewards/soft_format_reward_func": 0.0008928571428571428,
729
- "rewards/strict_format_reward_func": 0.0008928571428571428,
730
- "rewards/xmlcount_reward_func": 0.662653568812779,
731
- "step": 190
732
- },
733
- {
734
- "completion_length": 227.6078125,
735
- "epoch": 11.488549618320612,
736
- "grad_norm": 2.025723695755005,
737
- "kl": 0.09346012743189931,
738
- "learning_rate": 8.538767483325383e-10,
739
- "loss": 0.0001,
740
- "reward": 2.5617597877979277,
741
- "reward_std": 2.8330461889505387,
742
- "rewards/concensus_correctness_reward_func": 1.2098406325094402,
743
- "rewards/consensus_reward_func": 0.178125,
744
  "rewards/cumulative_reward_2": 0.0,
745
- "rewards/final_correctness_reward_func": 0.246875,
746
- "rewards/question_recreation_reward_func": 0.2995644178241491,
747
  "rewards/soft_format_reward_func": 0.0,
748
- "rewards/strict_format_reward_func": 0.00078125,
749
- "rewards/xmlcount_reward_func": 0.6265734456479549,
750
- "step": 195
751
  },
752
  {
753
- "completion_length": 226.3453125,
754
- "epoch": 11.793893129770993,
755
- "grad_norm": 1.561265230178833,
756
- "kl": 0.09566104216501117,
757
  "learning_rate": 0.0,
758
- "loss": 0.0001,
759
- "reward": 2.1840140745043755,
760
- "reward_std": 2.114570437371731,
761
- "rewards/concensus_correctness_reward_func": 0.9061656036414206,
762
- "rewards/consensus_reward_func": 0.228125,
763
- "rewards/cumulative_reward_2": 0.0,
764
- "rewards/final_correctness_reward_func": 0.115625,
765
- "rewards/question_recreation_reward_func": 0.3060203332453966,
766
  "rewards/soft_format_reward_func": 0.0,
767
- "rewards/strict_format_reward_func": 0.0015625,
768
- "rewards/xmlcount_reward_func": 0.626515619456768,
769
- "step": 200
770
  },
771
  {
772
- "epoch": 11.793893129770993,
773
- "step": 200,
774
  "total_flos": 0.0,
775
- "train_loss": 8.942109707277268e-05,
776
- "train_runtime": 4099.4474,
777
- "train_samples_per_second": 6.245,
778
- "train_steps_per_second": 0.049
779
  }
780
  ],
781
  "logging_steps": 5,
782
- "max_steps": 200,
783
  "num_input_tokens_seen": 0,
784
- "num_train_epochs": 13,
785
  "save_steps": 50,
786
  "stateful_callbacks": {
787
  "TrainerControl": {
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 16.761904761904763,
5
  "eval_steps": 500,
6
+ "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 214.846875,
13
+ "epoch": 0.9523809523809523,
14
+ "grad_norm": 4.286362171173096,
15
+ "kl": 0.007153257649042644,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  "learning_rate": 5e-07,
17
+ "loss": 0.0,
18
+ "reward": 0.9013831291347743,
19
+ "reward_std": 0.9495122246444225,
20
+ "rewards/concensus_correctness_reward_func": 0.07926875045523048,
21
+ "rewards/consensus_reward_func": 0.05,
22
+ "rewards/cumulative_reward_2": 0.0,
23
+ "rewards/final_correctness_reward_func": 0.165625,
24
+ "rewards/question_recreation_reward_func": 0.2006971864029765,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  "rewards/soft_format_reward_func": 0.00078125,
26
+ "rewards/strict_format_reward_func": 0.00234375,
27
+ "rewards/xmlcount_reward_func": 0.40266719218343494,
28
+ "step": 5
29
  },
30
  {
31
+ "completion_length": 217.63970588235293,
32
+ "epoch": 1.7619047619047619,
33
+ "grad_norm": 11.65625,
34
+ "kl": 0.003827589317498838,
35
  "learning_rate": 4.965903258506806e-07,
36
+ "loss": 0.0,
37
+ "reward": 0.9087017599274131,
38
+ "reward_std": 0.937035866519984,
39
+ "rewards/concensus_correctness_reward_func": 0.11116911733851713,
40
+ "rewards/consensus_reward_func": 0.04044117647058824,
41
+ "rewards/cumulative_reward_2": 0.0,
42
+ "rewards/final_correctness_reward_func": 0.1801470588235294,
43
+ "rewards/question_recreation_reward_func": 0.21326609646134517,
44
+ "rewards/soft_format_reward_func": 0.0009191176470588235,
45
+ "rewards/strict_format_reward_func": 0.0,
46
+ "rewards/xmlcount_reward_func": 0.36275919000892076,
47
+ "step": 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  },
49
  {
50
+ "completion_length": 215.58455882352942,
51
+ "epoch": 2.571428571428571,
52
+ "grad_norm": 4.602188587188721,
53
+ "kl": 0.007200981949127334,
54
  "learning_rate": 4.864543104251586e-07,
55
+ "loss": 0.0,
56
+ "reward": 0.8416915027534261,
57
+ "reward_std": 0.8690077452098622,
58
+ "rewards/concensus_correctness_reward_func": 0.060602940838126576,
59
+ "rewards/consensus_reward_func": 0.04044117647058824,
60
+ "rewards/cumulative_reward_2": 0.0,
61
+ "rewards/final_correctness_reward_func": 0.15073529411764705,
62
+ "rewards/question_recreation_reward_func": 0.22436061611070351,
63
+ "rewards/soft_format_reward_func": 0.0,
64
+ "rewards/strict_format_reward_func": 0.0,
65
+ "rewards/xmlcount_reward_func": 0.3655514717102051,
66
+ "step": 15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  },
68
  {
69
+ "completion_length": 217.41176470588235,
70
+ "epoch": 3.380952380952381,
71
+ "grad_norm": 3.5898094177246094,
72
+ "kl": 0.006835748431245413,
73
  "learning_rate": 4.698684378016222e-07,
74
+ "loss": 0.0,
75
+ "reward": 0.8475531142424134,
76
+ "reward_std": 0.9209246758152457,
77
+ "rewards/concensus_correctness_reward_func": 0.09411764736561214,
78
+ "rewards/consensus_reward_func": 0.025735294117647058,
79
+ "rewards/cumulative_reward_2": 0.0,
80
+ "rewards/final_correctness_reward_func": 0.16544117647058823,
81
+ "rewards/question_recreation_reward_func": 0.21184356072369745,
82
+ "rewards/soft_format_reward_func": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  "rewards/strict_format_reward_func": 0.0,
84
+ "rewards/xmlcount_reward_func": 0.3504154397284283,
85
+ "step": 20
86
  },
87
  {
88
+ "completion_length": 217.75551470588235,
89
+ "epoch": 4.190476190476191,
90
+ "grad_norm": 4.1127400398254395,
91
+ "kl": 0.004183680955868433,
92
+ "learning_rate": 4.472851273490984e-07,
93
+ "loss": 0.0,
94
+ "reward": 0.8926854492986903,
95
+ "reward_std": 0.9771755057222703,
96
+ "rewards/concensus_correctness_reward_func": 0.09619853119639789,
97
+ "rewards/consensus_reward_func": 0.0661764705882353,
98
+ "rewards/cumulative_reward_2": 0.0,
99
+ "rewards/final_correctness_reward_func": 0.14705882352941177,
100
+ "rewards/question_recreation_reward_func": 0.19953655747368054,
101
+ "rewards/soft_format_reward_func": 0.001838235294117647,
102
+ "rewards/strict_format_reward_func": 0.0009191176470588235,
103
+ "rewards/xmlcount_reward_func": 0.3809577248552266,
104
+ "step": 25
105
  },
106
  {
107
+ "completion_length": 218.11580882352942,
108
+ "epoch": 5.0,
109
+ "grad_norm": 1.4711809158325195,
110
+ "kl": 0.004230352046707755,
111
  "learning_rate": 4.193203929064353e-07,
112
+ "loss": 0.0,
113
+ "reward": 0.9753129236838397,
114
+ "reward_std": 0.9230635635116521,
115
+ "rewards/concensus_correctness_reward_func": 0.09226102951694937,
116
+ "rewards/consensus_reward_func": 0.04411764705882353,
117
+ "rewards/cumulative_reward_2": 0.0,
118
+ "rewards/final_correctness_reward_func": 0.17647058823529413,
119
+ "rewards/question_recreation_reward_func": 0.23682946558384335,
120
+ "rewards/soft_format_reward_func": 0.0,
121
+ "rewards/strict_format_reward_func": 0.0009191176470588235,
122
+ "rewards/xmlcount_reward_func": 0.4247150727931191,
123
+ "step": 30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  },
125
  {
126
+ "completion_length": 219.0828125,
127
+ "epoch": 5.9523809523809526,
128
+ "grad_norm": 9.926897048950195,
129
+ "kl": 0.00585453501844313,
130
  "learning_rate": 3.867370395306068e-07,
131
+ "loss": 0.0,
132
+ "reward": 1.0385151661932468,
133
+ "reward_std": 1.107630380988121,
134
+ "rewards/concensus_correctness_reward_func": 0.19590625064447523,
135
+ "rewards/consensus_reward_func": 0.059375,
136
  "rewards/cumulative_reward_2": 0.0,
137
+ "rewards/final_correctness_reward_func": 0.16875,
138
+ "rewards/question_recreation_reward_func": 0.23067768132314087,
139
  "rewards/soft_format_reward_func": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  "rewards/strict_format_reward_func": 0.0015625,
141
+ "rewards/xmlcount_reward_func": 0.382243749499321,
142
+ "step": 35
143
  },
144
  {
145
+ "completion_length": 217.82169117647058,
146
+ "epoch": 6.761904761904762,
147
+ "grad_norm": 4.228250026702881,
148
+ "kl": 0.004935655852451044,
149
  "learning_rate": 3.5042385616324236e-07,
150
+ "loss": 0.0,
151
+ "reward": 0.971110563067829,
152
+ "reward_std": 0.9750440252177855,
153
+ "rewards/concensus_correctness_reward_func": 0.1082463244743207,
154
+ "rewards/consensus_reward_func": 0.04779411764705882,
155
+ "rewards/cumulative_reward_2": 0.0,
156
+ "rewards/final_correctness_reward_func": 0.18382352941176472,
157
+ "rewards/question_recreation_reward_func": 0.2439561676891411,
158
  "rewards/soft_format_reward_func": 0.0,
159
+ "rewards/strict_format_reward_func": 0.0009191176470588235,
160
+ "rewards/xmlcount_reward_func": 0.38637132504407096,
161
+ "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  },
163
  {
164
+ "completion_length": 216.6672794117647,
165
+ "epoch": 7.571428571428571,
166
+ "grad_norm": 11.885100364685059,
167
+ "kl": 0.007807457162176862,
168
  "learning_rate": 3.1137137178519977e-07,
169
+ "loss": 0.0,
170
+ "reward": 0.9136327408692416,
171
+ "reward_std": 0.9482370290686103,
172
+ "rewards/concensus_correctness_reward_func": 0.09872794173219625,
173
+ "rewards/consensus_reward_func": 0.05514705882352941,
174
+ "rewards/cumulative_reward_2": 0.0,
175
+ "rewards/final_correctness_reward_func": 0.15808823529411764,
176
+ "rewards/question_recreation_reward_func": 0.2349783167900408,
177
+ "rewards/soft_format_reward_func": 0.001838235294117647,
178
+ "rewards/strict_format_reward_func": 0.0,
179
+ "rewards/xmlcount_reward_func": 0.36485293463749047,
180
+ "step": 45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  },
182
  {
183
+ "completion_length": 215.69301470588235,
184
+ "epoch": 8.380952380952381,
185
+ "grad_norm": 8.040736198425293,
186
+ "kl": 0.030681811523048535,
187
  "learning_rate": 2.706448363680831e-07,
188
+ "loss": 0.0,
189
+ "reward": 0.9279013907208162,
190
+ "reward_std": 0.9109513680724537,
191
+ "rewards/concensus_correctness_reward_func": 0.08287132454707342,
192
+ "rewards/consensus_reward_func": 0.05514705882352941,
193
+ "rewards/cumulative_reward_2": 0.0,
194
+ "rewards/final_correctness_reward_func": 0.17647058823529413,
195
+ "rewards/question_recreation_reward_func": 0.2283407300710678,
196
+ "rewards/soft_format_reward_func": 0.0009191176470588235,
197
+ "rewards/strict_format_reward_func": 0.0009191176470588235,
198
+ "rewards/xmlcount_reward_func": 0.3832334569271873,
199
+ "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  },
201
  {
202
+ "completion_length": 219.5183823529412,
203
+ "epoch": 9.19047619047619,
204
+ "grad_norm": 6.5597825050354,
205
+ "kl": 0.004213525095324525,
206
  "learning_rate": 2.2935516363191693e-07,
207
+ "loss": 0.0,
208
+ "reward": 0.8882785693687552,
209
+ "reward_std": 0.9178478770396289,
210
+ "rewards/concensus_correctness_reward_func": 0.07434191127472065,
211
+ "rewards/consensus_reward_func": 0.04411764705882353,
212
  "rewards/cumulative_reward_2": 0.0,
213
+ "rewards/final_correctness_reward_func": 0.1875,
214
+ "rewards/question_recreation_reward_func": 0.22048261701403296,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  "rewards/soft_format_reward_func": 0.0,
216
+ "rewards/strict_format_reward_func": 0.0009191176470588235,
217
+ "rewards/xmlcount_reward_func": 0.3609172793872216,
218
+ "step": 55
219
  },
220
  {
221
+ "completion_length": 216.66176470588235,
222
+ "epoch": 10.0,
223
+ "grad_norm": 1.862962007522583,
224
+ "kl": 0.002967422687695088,
225
  "learning_rate": 1.886286282148002e-07,
226
+ "loss": 0.0,
227
+ "reward": 0.8225221322739825,
228
+ "reward_std": 0.8722858481547412,
229
+ "rewards/concensus_correctness_reward_func": 0.07955514683442957,
230
+ "rewards/consensus_reward_func": 0.04779411764705882,
231
+ "rewards/cumulative_reward_2": 0.0,
232
+ "rewards/final_correctness_reward_func": 0.15073529411764705,
233
+ "rewards/question_recreation_reward_func": 0.18901477447327444,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  "rewards/soft_format_reward_func": 0.0,
235
+ "rewards/strict_format_reward_func": 0.0,
236
+ "rewards/xmlcount_reward_func": 0.3554227948188782,
237
+ "step": 60
238
  },
239
  {
240
+ "completion_length": 215.9703125,
241
+ "epoch": 10.952380952380953,
242
+ "grad_norm": 4.381515026092529,
243
+ "kl": 0.004608349371119402,
244
  "learning_rate": 1.4957614383675767e-07,
245
+ "loss": 0.0,
246
+ "reward": 0.9207330733537674,
247
+ "reward_std": 0.8800692193210125,
248
+ "rewards/concensus_correctness_reward_func": 0.07503749821335078,
249
+ "rewards/consensus_reward_func": 0.03125,
250
+ "rewards/cumulative_reward_2": 0.0,
251
+ "rewards/final_correctness_reward_func": 0.159375,
252
+ "rewards/question_recreation_reward_func": 0.247809642739594,
253
+ "rewards/soft_format_reward_func": 0.0,
254
  "rewards/strict_format_reward_func": 0.0015625,
255
+ "rewards/xmlcount_reward_func": 0.40569843612611295,
256
+ "step": 65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  },
258
  {
259
+ "completion_length": 217.2058823529412,
260
+ "epoch": 11.761904761904763,
261
+ "grad_norm": 3.9310142993927,
262
+ "kl": 0.0031400311820428163,
263
  "learning_rate": 1.1326296046939333e-07,
264
+ "loss": 0.0,
265
+ "reward": 0.9569352628553615,
266
+ "reward_std": 0.9295545737533009,
267
+ "rewards/concensus_correctness_reward_func": 0.08112867603845456,
268
+ "rewards/consensus_reward_func": 0.04779411764705882,
269
+ "rewards/cumulative_reward_2": 0.0,
270
+ "rewards/final_correctness_reward_func": 0.15073529411764705,
271
+ "rewards/question_recreation_reward_func": 0.25045547919238315,
272
  "rewards/soft_format_reward_func": 0.0,
273
+ "rewards/strict_format_reward_func": 0.0009191176470588235,
274
+ "rewards/xmlcount_reward_func": 0.42590257974670215,
275
+ "step": 70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  },
277
  {
278
+ "completion_length": 217.875,
279
+ "epoch": 12.571428571428571,
280
+ "grad_norm": 3.102419376373291,
281
+ "kl": 0.0023113159010844197,
282
  "learning_rate": 8.067960709356478e-08,
283
+ "loss": 0.0,
284
+ "reward": 0.8187810673433191,
285
+ "reward_std": 0.879577760310734,
286
+ "rewards/concensus_correctness_reward_func": 0.0920624991550165,
287
+ "rewards/consensus_reward_func": 0.025735294117647058,
288
+ "rewards/cumulative_reward_2": 0.0,
289
+ "rewards/final_correctness_reward_func": 0.13602941176470587,
290
+ "rewards/question_recreation_reward_func": 0.20141525446053812,
291
+ "rewards/soft_format_reward_func": 0.0009191176470588235,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  "rewards/strict_format_reward_func": 0.0,
293
+ "rewards/xmlcount_reward_func": 0.3626194854869562,
294
+ "step": 75
295
  },
296
  {
297
+ "completion_length": 221.18014705882354,
298
+ "epoch": 13.380952380952381,
299
+ "grad_norm": 4.012791633605957,
300
+ "kl": 0.005384887541260789,
301
  "learning_rate": 5.271487265090163e-08,
302
+ "loss": 0.0,
303
+ "reward": 0.8298134128837025,
304
+ "reward_std": 0.8251312436426387,
305
+ "rewards/concensus_correctness_reward_func": 0.07310661739286255,
306
+ "rewards/consensus_reward_func": 0.022058823529411766,
307
+ "rewards/cumulative_reward_2": 0.0,
308
+ "rewards/final_correctness_reward_func": 0.1801470588235294,
309
+ "rewards/question_recreation_reward_func": 0.2030891538323725,
310
+ "rewards/soft_format_reward_func": 0.0009191176470588235,
311
+ "rewards/strict_format_reward_func": 0.0009191176470588235,
312
+ "rewards/xmlcount_reward_func": 0.3495735298184788,
313
+ "step": 80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  },
315
  {
316
+ "completion_length": 219.46691176470588,
317
+ "epoch": 14.19047619047619,
318
+ "grad_norm": 2.952268123626709,
319
+ "kl": 0.0029621824026381705,
320
  "learning_rate": 3.013156219837776e-08,
321
+ "loss": 0.0,
322
+ "reward": 0.8352440113530439,
323
+ "reward_std": 0.8148685544729233,
324
+ "rewards/concensus_correctness_reward_func": 0.08091911651632365,
325
+ "rewards/consensus_reward_func": 0.04044117647058824,
326
+ "rewards/cumulative_reward_2": 0.0,
327
+ "rewards/final_correctness_reward_func": 0.125,
328
+ "rewards/question_recreation_reward_func": 0.21936532570158734,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
329
  "rewards/soft_format_reward_func": 0.0,
330
+ "rewards/strict_format_reward_func": 0.001838235294117647,
331
+ "rewards/xmlcount_reward_func": 0.3676801420309964,
332
+ "step": 85
333
  },
334
  {
335
+ "completion_length": 219.0514705882353,
336
+ "epoch": 15.0,
337
+ "grad_norm": 2.2386655807495117,
338
+ "kl": 0.003243645081323955,
339
+ "learning_rate": 1.3545689574841341e-08,
340
+ "loss": 0.0,
341
+ "reward": 0.865555027828497,
342
+ "reward_std": 0.847034204093849,
343
+ "rewards/concensus_correctness_reward_func": 0.06844852908569224,
344
+ "rewards/consensus_reward_func": 0.04411764705882353,
345
+ "rewards/cumulative_reward_2": 0.0,
346
+ "rewards/final_correctness_reward_func": 0.14705882352941177,
347
+ "rewards/question_recreation_reward_func": 0.2121359138366054,
348
+ "rewards/soft_format_reward_func": 0.0009191176470588235,
349
+ "rewards/strict_format_reward_func": 0.0009191176470588235,
350
+ "rewards/xmlcount_reward_func": 0.3919558823108673,
351
+ "step": 90
352
  },
353
  {
354
+ "completion_length": 221.35,
355
+ "epoch": 15.952380952380953,
356
+ "grad_norm": 5.4549055099487305,
357
+ "kl": 0.005640207437681966,
358
  "learning_rate": 3.4096741493194193e-09,
359
+ "loss": 0.0,
360
+ "reward": 0.977014084905386,
361
+ "reward_std": 1.0592330724000931,
362
+ "rewards/concensus_correctness_reward_func": 0.14016250055283308,
363
+ "rewards/consensus_reward_func": 0.05,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  "rewards/cumulative_reward_2": 0.0,
365
+ "rewards/final_correctness_reward_func": 0.18125,
366
+ "rewards/question_recreation_reward_func": 0.22033127043396233,
367
  "rewards/soft_format_reward_func": 0.0,
368
+ "rewards/strict_format_reward_func": 0.0,
369
+ "rewards/xmlcount_reward_func": 0.38527031214907764,
370
+ "step": 95
371
  },
372
  {
373
+ "completion_length": 218.4246323529412,
374
+ "epoch": 16.761904761904763,
375
+ "grad_norm": 4.802547931671143,
376
+ "kl": 0.003235554914948914,
377
  "learning_rate": 0.0,
378
+ "loss": 0.0,
379
+ "reward": 0.8579355530879077,
380
+ "reward_std": 0.9595372615491643,
381
+ "rewards/concensus_correctness_reward_func": 0.10365073623902657,
382
+ "rewards/consensus_reward_func": 0.025735294117647058,
383
+ "rewards/cumulative_reward_2": 0.0,
384
+ "rewards/final_correctness_reward_func": 0.13970588235294118,
385
+ "rewards/question_recreation_reward_func": 0.21492820759029949,
386
  "rewards/soft_format_reward_func": 0.0,
387
+ "rewards/strict_format_reward_func": 0.0,
388
+ "rewards/xmlcount_reward_func": 0.3739154419916518,
389
+ "step": 100
390
  },
391
  {
392
+ "epoch": 16.761904761904763,
393
+ "step": 100,
394
  "total_flos": 0.0,
395
+ "train_loss": 5.2903332107234745e-06,
396
+ "train_runtime": 1863.6299,
397
+ "train_samples_per_second": 6.868,
398
+ "train_steps_per_second": 0.054
399
  }
400
  ],
401
  "logging_steps": 5,
402
+ "max_steps": 100,
403
  "num_input_tokens_seen": 0,
404
+ "num_train_epochs": 20,
405
  "save_steps": 50,
406
  "stateful_callbacks": {
407
  "TrainerControl": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05ed80a22ff9c829edb7a4c8fd9cc4182f39df7c3e1c90d15f1ebf2fb8d1778c
3
  size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f61e7cd63625c52f24373a35fc8911f78375028dddc9c9ee533d76f160cb57ec
3
  size 5944