Avokado777 commited on
Commit
a297e51
·
verified ·
1 Parent(s): 1ba4748

End of training

Browse files
Files changed (4) hide show
  1. all_results.json +5 -5
  2. model.safetensors +1 -1
  3. train_results.json +5 -5
  4. trainer_state.json +232 -232
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": -0.00011151888082939157,
4
- "train_runtime": 1866.5324,
5
- "train_samples": 3,
6
- "train_samples_per_second": 0.043,
7
- "train_steps_per_second": 0.011
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": -0.003300715550648903,
4
+ "train_runtime": 932.3793,
5
+ "train_samples": 5,
6
+ "train_samples_per_second": 0.086,
7
+ "train_steps_per_second": 0.021
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63073bffd65cd38e2813613d2c79a4a61e3a6b4d1f1c85be273e56e884cfd0c3
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd74aabc43044b399b3fdbb155c26bd99656d58a8083aa93eee67aa286664299
3
  size 1976163472
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": -0.00011151888082939157,
4
- "train_runtime": 1866.5324,
5
- "train_samples": 3,
6
- "train_samples_per_second": 0.043,
7
- "train_steps_per_second": 0.011
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": -0.003300715550648903,
4
+ "train_runtime": 932.3793,
5
+ "train_samples": 5,
6
+ "train_samples_per_second": 0.086,
7
+ "train_steps_per_second": 0.021
8
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 19.666666666666668,
6
  "eval_steps": 500,
7
  "global_step": 20,
8
  "is_hyper_param_search": false,
@@ -16,36 +16,36 @@
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
- "completions/max_length": 113.0,
20
- "completions/max_terminated_length": 113.0,
21
- "completions/mean_length": 105.16666793823242,
22
- "completions/mean_terminated_length": 105.16666793823242,
23
- "completions/min_length": 89.5,
24
- "completions/min_terminated_length": 89.5,
25
- "epoch": 1.6666666666666665,
26
  "grad_norm": 0.0,
27
  "kl": 0.0,
28
  "learning_rate": 5e-07,
29
  "loss": 0.0,
30
- "num_tokens": 2858.0,
31
- "reward": 8.479824781417847,
32
  "reward_std": 0.0,
33
- "rewards/concensus_correctness_reward_func/mean": 2.9359999895095825,
34
- "rewards/concensus_correctness_reward_func/std": 1.1304517984390259,
35
- "rewards/consensus_reward_func/mean": 1.5,
36
- "rewards/consensus_reward_func/std": 0.5773502588272095,
37
  "rewards/cumulative_reward_2/mean": 0.0,
38
  "rewards/cumulative_reward_2/std": 0.0,
39
- "rewards/final_correctness_reward_func/mean": 2.0,
40
  "rewards/final_correctness_reward_func/std": 0.0,
41
- "rewards/question_recreation_reward_func/mean": 0.4813254624605179,
42
- "rewards/question_recreation_reward_func/std": 0.4909558594226837,
43
  "rewards/soft_format_reward_func/mean": 0.0,
44
  "rewards/soft_format_reward_func/std": 0.0,
45
- "rewards/strict_format_reward_func/mean": 0.375,
46
- "rewards/strict_format_reward_func/std": 0.14433756470680237,
47
- "rewards/xmlcount_reward_func/mean": 1.1875,
48
- "rewards/xmlcount_reward_func/std": 0.07216878235340118,
49
  "step": 2
50
  },
51
  {
@@ -55,36 +55,36 @@
55
  "clip_ratio/low_min": 0.0,
56
  "clip_ratio/region_mean": 0.0,
57
  "completions/clipped_ratio": 0.0,
58
- "completions/max_length": 113.0,
59
- "completions/max_terminated_length": 113.0,
60
- "completions/mean_length": 105.16666793823242,
61
- "completions/mean_terminated_length": 105.16666793823242,
62
- "completions/min_length": 89.5,
63
- "completions/min_terminated_length": 89.5,
64
- "epoch": 3.6666666666666665,
65
  "grad_norm": 0.0,
66
  "kl": 0.0,
67
  "learning_rate": 4.864543104251586e-07,
68
  "loss": 0.0,
69
- "num_tokens": 5716.0,
70
- "reward": 8.479824781417847,
71
  "reward_std": 0.0,
72
- "rewards/concensus_correctness_reward_func/mean": 2.9359999895095825,
73
- "rewards/concensus_correctness_reward_func/std": 1.1304517984390259,
74
- "rewards/consensus_reward_func/mean": 1.5,
75
- "rewards/consensus_reward_func/std": 0.5773502588272095,
76
  "rewards/cumulative_reward_2/mean": 0.0,
77
  "rewards/cumulative_reward_2/std": 0.0,
78
- "rewards/final_correctness_reward_func/mean": 2.0,
79
- "rewards/final_correctness_reward_func/std": 0.0,
80
- "rewards/question_recreation_reward_func/mean": 0.4813254624605179,
81
- "rewards/question_recreation_reward_func/std": 0.4909558594226837,
82
  "rewards/soft_format_reward_func/mean": 0.0,
83
  "rewards/soft_format_reward_func/std": 0.0,
84
- "rewards/strict_format_reward_func/mean": 0.375,
85
- "rewards/strict_format_reward_func/std": 0.14433756470680237,
86
- "rewards/xmlcount_reward_func/mean": 1.1875,
87
- "rewards/xmlcount_reward_func/std": 0.07216878235340118,
88
  "step": 4
89
  },
90
  {
@@ -94,36 +94,36 @@
94
  "clip_ratio/low_min": 0.0,
95
  "clip_ratio/region_mean": 0.0,
96
  "completions/clipped_ratio": 0.0,
97
- "completions/max_length": 125.0,
98
- "completions/max_terminated_length": 125.0,
99
- "completions/mean_length": 112.83333206176758,
100
- "completions/mean_terminated_length": 112.83333206176758,
101
- "completions/min_length": 89.5,
102
- "completions/min_terminated_length": 89.5,
103
- "epoch": 5.666666666666667,
104
- "grad_norm": 0.0,
105
  "kl": 0.0,
106
  "learning_rate": 4.472851273490984e-07,
107
- "loss": 0.0,
108
- "num_tokens": 8620.0,
109
- "reward": 6.849369525909424,
110
- "reward_std": 0.0,
111
- "rewards/concensus_correctness_reward_func/mean": 1.9579999446868896,
112
- "rewards/concensus_correctness_reward_func/std": 2.2609035968780518,
113
- "rewards/consensus_reward_func/mean": 1.0,
114
- "rewards/consensus_reward_func/std": 1.154700517654419,
115
  "rewards/cumulative_reward_2/mean": 0.0,
116
  "rewards/cumulative_reward_2/std": 0.0,
117
- "rewards/final_correctness_reward_func/mean": 2.0,
118
- "rewards/final_correctness_reward_func/std": 0.0,
119
- "rewards/question_recreation_reward_func/mean": 0.5163697898387909,
120
- "rewards/question_recreation_reward_func/std": 0.5474509298801422,
121
  "rewards/soft_format_reward_func/mean": 0.0,
122
  "rewards/soft_format_reward_func/std": 0.0,
123
- "rewards/strict_format_reward_func/mean": 0.25,
124
- "rewards/strict_format_reward_func/std": 0.28867512941360474,
125
- "rewards/xmlcount_reward_func/mean": 1.125,
126
- "rewards/xmlcount_reward_func/std": 0.14433756470680237,
127
  "step": 6
128
  },
129
  {
@@ -133,36 +133,36 @@
133
  "clip_ratio/low_min": 0.0,
134
  "clip_ratio/region_mean": 0.0,
135
  "completions/clipped_ratio": 0.0,
136
- "completions/max_length": 123.0,
137
- "completions/max_terminated_length": 123.0,
138
- "completions/mean_length": 100.66666793823242,
139
- "completions/mean_terminated_length": 100.66666793823242,
140
- "completions/min_length": 89.5,
141
- "completions/min_terminated_length": 89.5,
142
- "epoch": 7.666666666666667,
143
- "grad_norm": 0.0,
144
- "kl": 0.0,
145
  "learning_rate": 3.867370395306068e-07,
146
- "loss": 0.0,
147
- "num_tokens": 11522.0,
148
- "reward": 6.181347370147705,
149
- "reward_std": 0.0,
150
- "rewards/concensus_correctness_reward_func/mean": 1.459500014781952,
151
- "rewards/concensus_correctness_reward_func/std": 1.6852854490280151,
152
- "rewards/consensus_reward_func/mean": 1.0,
153
- "rewards/consensus_reward_func/std": 1.154700517654419,
154
  "rewards/cumulative_reward_2/mean": 0.0,
155
  "rewards/cumulative_reward_2/std": 0.0,
156
- "rewards/final_correctness_reward_func/mean": 2.0,
157
- "rewards/final_correctness_reward_func/std": 0.0,
158
- "rewards/question_recreation_reward_func/mean": 0.3468475788831711,
159
- "rewards/question_recreation_reward_func/std": 0.34804321825504303,
160
  "rewards/soft_format_reward_func/mean": 0.0,
161
  "rewards/soft_format_reward_func/std": 0.0,
162
- "rewards/strict_format_reward_func/mean": 0.25,
163
- "rewards/strict_format_reward_func/std": 0.28867512941360474,
164
- "rewards/xmlcount_reward_func/mean": 1.125,
165
- "rewards/xmlcount_reward_func/std": 0.14433756470680237,
166
  "step": 8
167
  },
168
  {
@@ -172,36 +172,36 @@
172
  "clip_ratio/low_min": 0.0,
173
  "clip_ratio/region_mean": 0.0,
174
  "completions/clipped_ratio": 0.0,
175
- "completions/max_length": 112.0,
176
- "completions/max_terminated_length": 112.0,
177
- "completions/mean_length": 97.0,
178
- "completions/mean_terminated_length": 97.0,
179
- "completions/min_length": 89.5,
180
- "completions/min_terminated_length": 89.5,
181
- "epoch": 9.666666666666666,
182
- "grad_norm": 0.0,
183
- "kl": 0.0,
184
  "learning_rate": 3.1137137178519977e-07,
185
  "loss": 0.0,
186
- "num_tokens": 14378.0,
187
- "reward": 7.34932804107666,
188
  "reward_std": 0.0,
189
- "rewards/concensus_correctness_reward_func/mean": 1.940500020980835,
190
- "rewards/concensus_correctness_reward_func/std": 1.1298744678497314,
191
- "rewards/consensus_reward_func/mean": 1.5,
192
- "rewards/consensus_reward_func/std": 0.5773502588272095,
193
  "rewards/cumulative_reward_2/mean": 0.0,
194
  "rewards/cumulative_reward_2/std": 0.0,
195
- "rewards/final_correctness_reward_func/mean": 2.0,
196
- "rewards/final_correctness_reward_func/std": 0.0,
197
- "rewards/question_recreation_reward_func/mean": 0.34632833302021027,
198
- "rewards/question_recreation_reward_func/std": 0.3486427888274193,
199
  "rewards/soft_format_reward_func/mean": 0.0,
200
  "rewards/soft_format_reward_func/std": 0.0,
201
- "rewards/strict_format_reward_func/mean": 0.375,
202
- "rewards/strict_format_reward_func/std": 0.14433756470680237,
203
- "rewards/xmlcount_reward_func/mean": 1.1875,
204
- "rewards/xmlcount_reward_func/std": 0.07216878235340118,
205
  "step": 10
206
  },
207
  {
@@ -211,36 +211,36 @@
211
  "clip_ratio/low_min": 0.0,
212
  "clip_ratio/region_mean": 0.0,
213
  "completions/clipped_ratio": 0.0,
214
- "completions/max_length": 129.0,
215
- "completions/max_terminated_length": 129.0,
216
- "completions/mean_length": 114.16666793823242,
217
- "completions/mean_terminated_length": 114.16666793823242,
218
- "completions/min_length": 89.5,
219
- "completions/min_terminated_length": 89.5,
220
- "epoch": 11.666666666666666,
221
- "grad_norm": 0.0,
222
- "kl": 0.0,
223
  "learning_rate": 2.2935516363191693e-07,
224
  "loss": 0.0,
225
- "num_tokens": 17290.0,
226
- "reward": 6.849369525909424,
227
  "reward_std": 0.0,
228
- "rewards/concensus_correctness_reward_func/mean": 1.9579999446868896,
229
- "rewards/concensus_correctness_reward_func/std": 2.2609035968780518,
230
- "rewards/consensus_reward_func/mean": 1.0,
231
- "rewards/consensus_reward_func/std": 1.154700517654419,
232
  "rewards/cumulative_reward_2/mean": 0.0,
233
  "rewards/cumulative_reward_2/std": 0.0,
234
- "rewards/final_correctness_reward_func/mean": 2.0,
235
- "rewards/final_correctness_reward_func/std": 0.0,
236
- "rewards/question_recreation_reward_func/mean": 0.5163697898387909,
237
- "rewards/question_recreation_reward_func/std": 0.5474509298801422,
238
  "rewards/soft_format_reward_func/mean": 0.0,
239
  "rewards/soft_format_reward_func/std": 0.0,
240
- "rewards/strict_format_reward_func/mean": 0.25,
241
- "rewards/strict_format_reward_func/std": 0.28867512941360474,
242
- "rewards/xmlcount_reward_func/mean": 1.125,
243
- "rewards/xmlcount_reward_func/std": 0.14433756470680237,
244
  "step": 12
245
  },
246
  {
@@ -250,35 +250,35 @@
250
  "clip_ratio/low_min": 0.0,
251
  "clip_ratio/region_mean": 0.0,
252
  "completions/clipped_ratio": 0.0,
253
- "completions/max_length": 101.0,
254
- "completions/max_terminated_length": 101.0,
255
- "completions/mean_length": 85.66666412353516,
256
- "completions/mean_terminated_length": 85.66666412353516,
257
- "completions/min_length": 78.0,
258
- "completions/min_terminated_length": 78.0,
259
- "epoch": 13.666666666666666,
260
- "grad_norm": 0.0,
261
- "kl": 0.0,
262
  "learning_rate": 1.4957614383675767e-07,
263
  "loss": 0.0,
264
- "num_tokens": 20054.0,
265
- "reward": 7.936056613922119,
266
  "reward_std": 0.0,
267
- "rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
268
  "rewards/concensus_correctness_reward_func/std": 0.0,
269
- "rewards/consensus_reward_func/mean": 2.0,
270
  "rewards/consensus_reward_func/std": 0.0,
271
  "rewards/cumulative_reward_2/mean": 0.0,
272
  "rewards/cumulative_reward_2/std": 0.0,
273
- "rewards/final_correctness_reward_func/mean": 2.0,
274
  "rewards/final_correctness_reward_func/std": 0.0,
275
- "rewards/question_recreation_reward_func/mean": 0.2620568871498108,
276
- "rewards/question_recreation_reward_func/std": 0.24471749365329742,
277
  "rewards/soft_format_reward_func/mean": 0.0,
278
  "rewards/soft_format_reward_func/std": 0.0,
279
- "rewards/strict_format_reward_func/mean": 0.5,
280
  "rewards/strict_format_reward_func/std": 0.0,
281
- "rewards/xmlcount_reward_func/mean": 1.25,
282
  "rewards/xmlcount_reward_func/std": 0.0,
283
  "step": 14
284
  },
@@ -289,36 +289,36 @@
289
  "clip_ratio/low_min": 0.0,
290
  "clip_ratio/region_mean": 0.0,
291
  "completions/clipped_ratio": 0.0,
292
- "completions/max_length": 125.0,
293
- "completions/max_terminated_length": 125.0,
294
- "completions/mean_length": 108.83333206176758,
295
- "completions/mean_terminated_length": 108.83333206176758,
296
- "completions/min_length": 89.5,
297
- "completions/min_terminated_length": 89.5,
298
- "epoch": 15.666666666666666,
299
- "grad_norm": 0.0,
300
- "kl": 0.0,
301
  "learning_rate": 8.067960709356478e-08,
302
  "loss": 0.0,
303
- "num_tokens": 22956.0,
304
- "reward": 6.805034160614014,
305
  "reward_std": 0.0,
306
- "rewards/concensus_correctness_reward_func/mean": 1.9574999809265137,
307
- "rewards/concensus_correctness_reward_func/std": 2.2603262662887573,
308
- "rewards/consensus_reward_func/mean": 1.0,
309
- "rewards/consensus_reward_func/std": 1.154700517654419,
310
  "rewards/cumulative_reward_2/mean": 0.0,
311
  "rewards/cumulative_reward_2/std": 0.0,
312
- "rewards/final_correctness_reward_func/mean": 2.0,
313
  "rewards/final_correctness_reward_func/std": 0.0,
314
- "rewards/question_recreation_reward_func/mean": 0.47253431379795074,
315
- "rewards/question_recreation_reward_func/std": 0.5011070221662521,
316
  "rewards/soft_format_reward_func/mean": 0.0,
317
  "rewards/soft_format_reward_func/std": 0.0,
318
- "rewards/strict_format_reward_func/mean": 0.25,
319
- "rewards/strict_format_reward_func/std": 0.28867512941360474,
320
- "rewards/xmlcount_reward_func/mean": 1.125,
321
- "rewards/xmlcount_reward_func/std": 0.14433756470680237,
322
  "step": 16
323
  },
324
  {
@@ -328,36 +328,36 @@
328
  "clip_ratio/low_min": 0.0,
329
  "clip_ratio/region_mean": 0.0,
330
  "completions/clipped_ratio": 0.0,
331
- "completions/max_length": 125.0,
332
- "completions/max_terminated_length": 125.0,
333
- "completions/mean_length": 101.0,
334
- "completions/mean_terminated_length": 101.0,
335
- "completions/min_length": 77.5,
336
- "completions/min_terminated_length": 77.5,
337
- "epoch": 17.666666666666668,
338
- "grad_norm": 5.54299783706665,
339
- "kl": 0.0,
340
  "learning_rate": 3.013156219837776e-08,
341
- "loss": -0.0011,
342
- "num_tokens": 25826.0,
343
- "reward": 6.22217059135437,
344
- "reward_std": 0.0022791330702602863,
345
- "rewards/concensus_correctness_reward_func/mean": 1.4599999785423279,
346
- "rewards/concensus_correctness_reward_func/std": 1.6858627796173096,
347
- "rewards/consensus_reward_func/mean": 1.0,
348
- "rewards/consensus_reward_func/std": 1.154700517654419,
349
  "rewards/cumulative_reward_2/mean": 0.0,
350
  "rewards/cumulative_reward_2/std": 0.0,
351
- "rewards/final_correctness_reward_func/mean": 2.0,
352
- "rewards/final_correctness_reward_func/std": 0.0,
353
- "rewards/question_recreation_reward_func/mean": 0.3871704339981079,
354
- "rewards/question_recreation_reward_func/std": 0.39475004374980927,
355
  "rewards/soft_format_reward_func/mean": 0.0,
356
  "rewards/soft_format_reward_func/std": 0.0,
357
- "rewards/strict_format_reward_func/mean": 0.25,
358
- "rewards/strict_format_reward_func/std": 0.28867512941360474,
359
- "rewards/xmlcount_reward_func/mean": 1.125,
360
- "rewards/xmlcount_reward_func/std": 0.14433756470680237,
361
  "step": 18
362
  },
363
  {
@@ -367,52 +367,52 @@
367
  "clip_ratio/low_min": 0.0,
368
  "clip_ratio/region_mean": 0.0,
369
  "completions/clipped_ratio": 0.0,
370
- "completions/max_length": 125.0,
371
- "completions/max_terminated_length": 125.0,
372
- "completions/mean_length": 105.16666412353516,
373
- "completions/mean_terminated_length": 105.16666412353516,
374
- "completions/min_length": 89.5,
375
- "completions/min_terminated_length": 89.5,
376
- "epoch": 19.666666666666668,
377
- "grad_norm": 7.460154847649392e-06,
378
- "kl": 9.199701889173184e-09,
379
  "learning_rate": 3.4096741493194193e-09,
380
  "loss": 0.0,
381
- "num_tokens": 28731.0,
382
- "reward": 6.225682735443115,
383
  "reward_std": 0.0,
384
- "rewards/concensus_correctness_reward_func/mean": 1.4599999785423279,
385
- "rewards/concensus_correctness_reward_func/std": 1.6858627796173096,
386
- "rewards/consensus_reward_func/mean": 1.0,
387
- "rewards/consensus_reward_func/std": 1.154700517654419,
388
  "rewards/cumulative_reward_2/mean": 0.0,
389
  "rewards/cumulative_reward_2/std": 0.0,
390
- "rewards/final_correctness_reward_func/mean": 2.0,
391
- "rewards/final_correctness_reward_func/std": 0.0,
392
- "rewards/question_recreation_reward_func/mean": 0.39068305492401123,
393
- "rewards/question_recreation_reward_func/std": 0.3943871259689331,
394
  "rewards/soft_format_reward_func/mean": 0.0,
395
  "rewards/soft_format_reward_func/std": 0.0,
396
- "rewards/strict_format_reward_func/mean": 0.25,
397
- "rewards/strict_format_reward_func/std": 0.28867512941360474,
398
- "rewards/xmlcount_reward_func/mean": 1.125,
399
- "rewards/xmlcount_reward_func/std": 0.14433756470680237,
400
  "step": 20
401
  },
402
  {
403
- "epoch": 19.666666666666668,
404
  "step": 20,
405
  "total_flos": 0.0,
406
- "train_loss": -0.00011151888082939157,
407
- "train_runtime": 1866.5324,
408
- "train_samples_per_second": 0.043,
409
- "train_steps_per_second": 0.011
410
  }
411
  ],
412
  "logging_steps": 2,
413
  "max_steps": 20,
414
- "num_input_tokens_seen": 28731,
415
- "num_train_epochs": 20,
416
  "save_steps": 25,
417
  "stateful_callbacks": {
418
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 9.8,
6
  "eval_steps": 500,
7
  "global_step": 20,
8
  "is_hyper_param_search": false,
 
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
+ "completions/max_length": 21.0,
20
+ "completions/max_terminated_length": 21.0,
21
+ "completions/mean_length": 20.75,
22
+ "completions/mean_terminated_length": 20.75,
23
+ "completions/min_length": 20.5,
24
+ "completions/min_terminated_length": 20.5,
25
+ "epoch": 0.8,
26
  "grad_norm": 0.0,
27
  "kl": 0.0,
28
  "learning_rate": 5e-07,
29
  "loss": 0.0,
30
+ "num_tokens": 2214.0,
31
+ "reward": 1.273802012205124,
32
  "reward_std": 0.0,
33
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
34
+ "rewards/concensus_correctness_reward_func/std": 0.0,
35
+ "rewards/consensus_reward_func/mean": 0.0,
36
+ "rewards/consensus_reward_func/std": 0.0,
37
  "rewards/cumulative_reward_2/mean": 0.0,
38
  "rewards/cumulative_reward_2/std": 0.0,
39
+ "rewards/final_correctness_reward_func/mean": 1.0,
40
  "rewards/final_correctness_reward_func/std": 0.0,
41
+ "rewards/question_recreation_reward_func/mean": 0.02580203115940094,
42
+ "rewards/question_recreation_reward_func/std": 0.001379721456032712,
43
  "rewards/soft_format_reward_func/mean": 0.0,
44
  "rewards/soft_format_reward_func/std": 0.0,
45
+ "rewards/strict_format_reward_func/mean": 0.0,
46
+ "rewards/strict_format_reward_func/std": 0.0,
47
+ "rewards/xmlcount_reward_func/mean": 0.24799999594688416,
48
+ "rewards/xmlcount_reward_func/std": 0.0,
49
  "step": 2
50
  },
51
  {
 
55
  "clip_ratio/low_min": 0.0,
56
  "clip_ratio/region_mean": 0.0,
57
  "completions/clipped_ratio": 0.0,
58
+ "completions/max_length": 22.0,
59
+ "completions/max_terminated_length": 22.0,
60
+ "completions/mean_length": 21.25,
61
+ "completions/mean_terminated_length": 21.25,
62
+ "completions/min_length": 20.5,
63
+ "completions/min_terminated_length": 20.5,
64
+ "epoch": 1.8,
65
  "grad_norm": 0.0,
66
  "kl": 0.0,
67
  "learning_rate": 4.864543104251586e-07,
68
  "loss": 0.0,
69
+ "num_tokens": 4432.0,
70
+ "reward": 1.2818379402160645,
71
  "reward_std": 0.0,
72
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
73
+ "rewards/concensus_correctness_reward_func/std": 0.0,
74
+ "rewards/consensus_reward_func/mean": 0.0,
75
+ "rewards/consensus_reward_func/std": 0.0,
76
  "rewards/cumulative_reward_2/mean": 0.0,
77
  "rewards/cumulative_reward_2/std": 0.0,
78
+ "rewards/final_correctness_reward_func/mean": 1.0,
79
+ "rewards/final_correctness_reward_func/std": 1.154700517654419,
80
+ "rewards/question_recreation_reward_func/mean": 0.03383800573647022,
81
+ "rewards/question_recreation_reward_func/std": 0.0016485399100929499,
82
  "rewards/soft_format_reward_func/mean": 0.0,
83
  "rewards/soft_format_reward_func/std": 0.0,
84
+ "rewards/strict_format_reward_func/mean": 0.0,
85
+ "rewards/strict_format_reward_func/std": 0.0,
86
+ "rewards/xmlcount_reward_func/mean": 0.24799999594688416,
87
+ "rewards/xmlcount_reward_func/std": 0.0,
88
  "step": 4
89
  },
90
  {
 
94
  "clip_ratio/low_min": 0.0,
95
  "clip_ratio/region_mean": 0.0,
96
  "completions/clipped_ratio": 0.0,
97
+ "completions/max_length": 21.0,
98
+ "completions/max_terminated_length": 21.0,
99
+ "completions/mean_length": 21.0,
100
+ "completions/mean_terminated_length": 21.0,
101
+ "completions/min_length": 21.0,
102
+ "completions/min_terminated_length": 21.0,
103
+ "epoch": 2.8,
104
+ "grad_norm": 52.8972282409668,
105
  "kl": 0.0,
106
  "learning_rate": 4.472851273490984e-07,
107
+ "loss": -0.0038,
108
+ "num_tokens": 6651.0,
109
+ "reward": 0.8212482184171677,
110
+ "reward_std": 0.0007691325736232102,
111
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
112
+ "rewards/concensus_correctness_reward_func/std": 0.0,
113
+ "rewards/consensus_reward_func/mean": 0.0,
114
+ "rewards/consensus_reward_func/std": 0.0,
115
  "rewards/cumulative_reward_2/mean": 0.0,
116
  "rewards/cumulative_reward_2/std": 0.0,
117
+ "rewards/final_correctness_reward_func/mean": 0.5,
118
+ "rewards/final_correctness_reward_func/std": 0.5773502588272095,
119
+ "rewards/question_recreation_reward_func/mean": 0.07324827183037996,
120
+ "rewards/question_recreation_reward_func/std": 0.001122168148867786,
121
  "rewards/soft_format_reward_func/mean": 0.0,
122
  "rewards/soft_format_reward_func/std": 0.0,
123
+ "rewards/strict_format_reward_func/mean": 0.0,
124
+ "rewards/strict_format_reward_func/std": 0.0,
125
+ "rewards/xmlcount_reward_func/mean": 0.24799999594688416,
126
+ "rewards/xmlcount_reward_func/std": 0.0,
127
  "step": 6
128
  },
129
  {
 
133
  "clip_ratio/low_min": 0.0,
134
  "clip_ratio/region_mean": 0.0,
135
  "completions/clipped_ratio": 0.0,
136
+ "completions/max_length": 21.0,
137
+ "completions/max_terminated_length": 21.0,
138
+ "completions/mean_length": 20.25,
139
+ "completions/mean_terminated_length": 20.25,
140
+ "completions/min_length": 18.0,
141
+ "completions/min_terminated_length": 18.0,
142
+ "epoch": 3.8,
143
+ "grad_norm": 3.309718522359617e-05,
144
+ "kl": 0.004937448339270789,
145
  "learning_rate": 3.867370395306068e-07,
146
+ "loss": -0.0292,
147
+ "num_tokens": 8865.0,
148
+ "reward": 0.8165916055440903,
149
+ "reward_std": 0.0030181410256773233,
150
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
151
+ "rewards/concensus_correctness_reward_func/std": 0.0,
152
+ "rewards/consensus_reward_func/mean": 0.0,
153
+ "rewards/consensus_reward_func/std": 0.0,
154
  "rewards/cumulative_reward_2/mean": 0.0,
155
  "rewards/cumulative_reward_2/std": 0.0,
156
+ "rewards/final_correctness_reward_func/mean": 0.5,
157
+ "rewards/final_correctness_reward_func/std": 0.5773502588272095,
158
+ "rewards/question_recreation_reward_func/mean": 0.06859164033085108,
159
+ "rewards/question_recreation_reward_func/std": 0.00623092451132834,
160
  "rewards/soft_format_reward_func/mean": 0.0,
161
  "rewards/soft_format_reward_func/std": 0.0,
162
+ "rewards/strict_format_reward_func/mean": 0.0,
163
+ "rewards/strict_format_reward_func/std": 0.0,
164
+ "rewards/xmlcount_reward_func/mean": 0.24799999594688416,
165
+ "rewards/xmlcount_reward_func/std": 0.0,
166
  "step": 8
167
  },
168
  {
 
172
  "clip_ratio/low_min": 0.0,
173
  "clip_ratio/region_mean": 0.0,
174
  "completions/clipped_ratio": 0.0,
175
+ "completions/max_length": 21.0,
176
+ "completions/max_terminated_length": 21.0,
177
+ "completions/mean_length": 20.75,
178
+ "completions/mean_terminated_length": 20.75,
179
+ "completions/min_length": 20.5,
180
+ "completions/min_terminated_length": 20.5,
181
+ "epoch": 4.8,
182
+ "grad_norm": 8.56333404186671e-09,
183
+ "kl": 9.624730929802539e-06,
184
  "learning_rate": 3.1137137178519977e-07,
185
  "loss": 0.0,
186
+ "num_tokens": 11079.0,
187
+ "reward": 1.3171695470809937,
188
  "reward_std": 0.0,
189
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
190
+ "rewards/concensus_correctness_reward_func/std": 0.0,
191
+ "rewards/consensus_reward_func/mean": 0.0,
192
+ "rewards/consensus_reward_func/std": 0.0,
193
  "rewards/cumulative_reward_2/mean": 0.0,
194
  "rewards/cumulative_reward_2/std": 0.0,
195
+ "rewards/final_correctness_reward_func/mean": 1.0,
196
+ "rewards/final_correctness_reward_func/std": 1.154700517654419,
197
+ "rewards/question_recreation_reward_func/mean": 0.06916956789791584,
198
+ "rewards/question_recreation_reward_func/std": 0.00016566732665523887,
199
  "rewards/soft_format_reward_func/mean": 0.0,
200
  "rewards/soft_format_reward_func/std": 0.0,
201
+ "rewards/strict_format_reward_func/mean": 0.0,
202
+ "rewards/strict_format_reward_func/std": 0.0,
203
+ "rewards/xmlcount_reward_func/mean": 0.24799999594688416,
204
+ "rewards/xmlcount_reward_func/std": 0.0,
205
  "step": 10
206
  },
207
  {
 
211
  "clip_ratio/low_min": 0.0,
212
  "clip_ratio/region_mean": 0.0,
213
  "completions/clipped_ratio": 0.0,
214
+ "completions/max_length": 22.0,
215
+ "completions/max_terminated_length": 22.0,
216
+ "completions/mean_length": 21.25,
217
+ "completions/mean_terminated_length": 21.25,
218
+ "completions/min_length": 20.5,
219
+ "completions/min_terminated_length": 20.5,
220
+ "epoch": 5.8,
221
+ "grad_norm": 2.63284931634189e-07,
222
+ "kl": 0.004934789435364628,
223
  "learning_rate": 2.2935516363191693e-07,
224
  "loss": 0.0,
225
+ "num_tokens": 13297.0,
226
+ "reward": 0.7760129868984222,
227
  "reward_std": 0.0,
228
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
229
+ "rewards/concensus_correctness_reward_func/std": 0.0,
230
+ "rewards/consensus_reward_func/mean": 0.0,
231
+ "rewards/consensus_reward_func/std": 0.0,
232
  "rewards/cumulative_reward_2/mean": 0.0,
233
  "rewards/cumulative_reward_2/std": 0.0,
234
+ "rewards/final_correctness_reward_func/mean": 0.5,
235
+ "rewards/final_correctness_reward_func/std": 0.5773502588272095,
236
+ "rewards/question_recreation_reward_func/mean": 0.02801300771534443,
237
+ "rewards/question_recreation_reward_func/std": 0.0014285182114690542,
238
  "rewards/soft_format_reward_func/mean": 0.0,
239
  "rewards/soft_format_reward_func/std": 0.0,
240
+ "rewards/strict_format_reward_func/mean": 0.0,
241
+ "rewards/strict_format_reward_func/std": 0.0,
242
+ "rewards/xmlcount_reward_func/mean": 0.24799999594688416,
243
+ "rewards/xmlcount_reward_func/std": 0.0,
244
  "step": 12
245
  },
246
  {
 
250
  "clip_ratio/low_min": 0.0,
251
  "clip_ratio/region_mean": 0.0,
252
  "completions/clipped_ratio": 0.0,
253
+ "completions/max_length": 21.0,
254
+ "completions/max_terminated_length": 21.0,
255
+ "completions/mean_length": 20.75,
256
+ "completions/mean_terminated_length": 20.75,
257
+ "completions/min_length": 20.5,
258
+ "completions/min_terminated_length": 20.5,
259
+ "epoch": 6.8,
260
+ "grad_norm": 1.60744309596339e-06,
261
+ "kl": 0.004935034728207999,
262
  "learning_rate": 1.4957614383675767e-07,
263
  "loss": 0.0,
264
+ "num_tokens": 15515.0,
265
+ "reward": 1.3190129548311234,
266
  "reward_std": 0.0,
267
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
268
  "rewards/concensus_correctness_reward_func/std": 0.0,
269
+ "rewards/consensus_reward_func/mean": 0.0,
270
  "rewards/consensus_reward_func/std": 0.0,
271
  "rewards/cumulative_reward_2/mean": 0.0,
272
  "rewards/cumulative_reward_2/std": 0.0,
273
+ "rewards/final_correctness_reward_func/mean": 1.0,
274
  "rewards/final_correctness_reward_func/std": 0.0,
275
+ "rewards/question_recreation_reward_func/mean": 0.07101300172507763,
276
+ "rewards/question_recreation_reward_func/std": 0.0003500293714751024,
277
  "rewards/soft_format_reward_func/mean": 0.0,
278
  "rewards/soft_format_reward_func/std": 0.0,
279
+ "rewards/strict_format_reward_func/mean": 0.0,
280
  "rewards/strict_format_reward_func/std": 0.0,
281
+ "rewards/xmlcount_reward_func/mean": 0.24799999594688416,
282
  "rewards/xmlcount_reward_func/std": 0.0,
283
  "step": 14
284
  },
 
289
  "clip_ratio/low_min": 0.0,
290
  "clip_ratio/region_mean": 0.0,
291
  "completions/clipped_ratio": 0.0,
292
+ "completions/max_length": 21.0,
293
+ "completions/max_terminated_length": 21.0,
294
+ "completions/mean_length": 20.75,
295
+ "completions/mean_terminated_length": 20.75,
296
+ "completions/min_length": 20.5,
297
+ "completions/min_terminated_length": 20.5,
298
+ "epoch": 7.8,
299
+ "grad_norm": 2.496720981071121e-07,
300
+ "kl": 9.651695065837629e-06,
301
  "learning_rate": 8.067960709356478e-08,
302
  "loss": 0.0,
303
+ "num_tokens": 17729.0,
304
+ "reward": 1.3170444816350937,
305
  "reward_std": 0.0,
306
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
307
+ "rewards/concensus_correctness_reward_func/std": 0.0,
308
+ "rewards/consensus_reward_func/mean": 0.0,
309
+ "rewards/consensus_reward_func/std": 0.0,
310
  "rewards/cumulative_reward_2/mean": 0.0,
311
  "rewards/cumulative_reward_2/std": 0.0,
312
+ "rewards/final_correctness_reward_func/mean": 1.0,
313
  "rewards/final_correctness_reward_func/std": 0.0,
314
+ "rewards/question_recreation_reward_func/mean": 0.06904449872672558,
315
+ "rewards/question_recreation_reward_func/std": 2.124987804563716e-05,
316
  "rewards/soft_format_reward_func/mean": 0.0,
317
  "rewards/soft_format_reward_func/std": 0.0,
318
+ "rewards/strict_format_reward_func/mean": 0.0,
319
+ "rewards/strict_format_reward_func/std": 0.0,
320
+ "rewards/xmlcount_reward_func/mean": 0.24799999594688416,
321
+ "rewards/xmlcount_reward_func/std": 0.0,
322
  "step": 16
323
  },
324
  {
 
328
  "clip_ratio/low_min": 0.0,
329
  "clip_ratio/region_mean": 0.0,
330
  "completions/clipped_ratio": 0.0,
331
+ "completions/max_length": 20.5,
332
+ "completions/max_terminated_length": 20.5,
333
+ "completions/mean_length": 20.5,
334
+ "completions/mean_terminated_length": 20.5,
335
+ "completions/min_length": 20.5,
336
+ "completions/min_terminated_length": 20.5,
337
+ "epoch": 8.8,
338
+ "grad_norm": 4.384970608839467e-09,
339
+ "kl": 9.651695065837629e-06,
340
  "learning_rate": 3.013156219837776e-08,
341
+ "loss": 0.0,
342
+ "num_tokens": 19943.0,
343
+ "reward": 1.3192957043647766,
344
+ "reward_std": 0.0,
345
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
346
+ "rewards/concensus_correctness_reward_func/std": 0.0,
347
+ "rewards/consensus_reward_func/mean": 0.0,
348
+ "rewards/consensus_reward_func/std": 0.0,
349
  "rewards/cumulative_reward_2/mean": 0.0,
350
  "rewards/cumulative_reward_2/std": 0.0,
351
+ "rewards/final_correctness_reward_func/mean": 1.0,
352
+ "rewards/final_correctness_reward_func/std": 1.154700517654419,
353
+ "rewards/question_recreation_reward_func/mean": 0.07129578851163387,
354
+ "rewards/question_recreation_reward_func/std": 2.34974613704253e-05,
355
  "rewards/soft_format_reward_func/mean": 0.0,
356
  "rewards/soft_format_reward_func/std": 0.0,
357
+ "rewards/strict_format_reward_func/mean": 0.0,
358
+ "rewards/strict_format_reward_func/std": 0.0,
359
+ "rewards/xmlcount_reward_func/mean": 0.24799999594688416,
360
+ "rewards/xmlcount_reward_func/std": 0.0,
361
  "step": 18
362
  },
363
  {
 
367
  "clip_ratio/low_min": 0.0,
368
  "clip_ratio/region_mean": 0.0,
369
  "completions/clipped_ratio": 0.0,
370
+ "completions/max_length": 22.0,
371
+ "completions/max_terminated_length": 22.0,
372
+ "completions/mean_length": 21.25,
373
+ "completions/mean_terminated_length": 21.25,
374
+ "completions/min_length": 20.5,
375
+ "completions/min_terminated_length": 20.5,
376
+ "epoch": 9.8,
377
+ "grad_norm": 7.085214193125466e-09,
378
+ "kl": 0.00492547295011958,
379
  "learning_rate": 3.4096741493194193e-09,
380
  "loss": 0.0,
381
+ "num_tokens": 22161.0,
382
+ "reward": 1.2748364806175232,
383
  "reward_std": 0.0,
384
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
385
+ "rewards/concensus_correctness_reward_func/std": 0.0,
386
+ "rewards/consensus_reward_func/mean": 0.0,
387
+ "rewards/consensus_reward_func/std": 0.0,
388
  "rewards/cumulative_reward_2/mean": 0.0,
389
  "rewards/cumulative_reward_2/std": 0.0,
390
+ "rewards/final_correctness_reward_func/mean": 1.0,
391
+ "rewards/final_correctness_reward_func/std": 1.154700517654419,
392
+ "rewards/question_recreation_reward_func/mean": 0.026836536824703217,
393
+ "rewards/question_recreation_reward_func/std": 7.004663348197937e-05,
394
  "rewards/soft_format_reward_func/mean": 0.0,
395
  "rewards/soft_format_reward_func/std": 0.0,
396
+ "rewards/strict_format_reward_func/mean": 0.0,
397
+ "rewards/strict_format_reward_func/std": 0.0,
398
+ "rewards/xmlcount_reward_func/mean": 0.24799999594688416,
399
+ "rewards/xmlcount_reward_func/std": 0.0,
400
  "step": 20
401
  },
402
  {
403
+ "epoch": 9.8,
404
  "step": 20,
405
  "total_flos": 0.0,
406
+ "train_loss": -0.003300715550648903,
407
+ "train_runtime": 932.3793,
408
+ "train_samples_per_second": 0.086,
409
+ "train_steps_per_second": 0.021
410
  }
411
  ],
412
  "logging_steps": 2,
413
  "max_steps": 20,
414
+ "num_input_tokens_seen": 22161,
415
+ "num_train_epochs": 10,
416
  "save_steps": 25,
417
  "stateful_callbacks": {
418
  "TrainerControl": {