Avokado777 committed on
Commit
d3daf77
·
verified ·
1 Parent(s): 427b7a8

End of training

Browse files
Files changed (4) hide show
  1. all_results.json +5 -5
  2. model.safetensors +1 -1
  3. train_results.json +5 -5
  4. trainer_state.json +246 -246
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.012940555065870284,
4
- "train_runtime": 4717.7232,
5
- "train_samples": 3,
6
- "train_samples_per_second": 0.017,
7
- "train_steps_per_second": 0.004
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.037779451161623,
4
+ "train_runtime": 7169.9681,
5
+ "train_samples": 5,
6
+ "train_samples_per_second": 0.011,
7
+ "train_steps_per_second": 0.003
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:006bbd06767f2805a055debcec52ce7472120081e31468d9fd14e0ba06b082b5
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adecc22f303850912efbf20277ad42ab67f6089c6d70133581f073f6e2538487
3
  size 1976163472
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.012940555065870284,
4
- "train_runtime": 4717.7232,
5
- "train_samples": 3,
6
- "train_samples_per_second": 0.017,
7
- "train_steps_per_second": 0.004
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.037779451161623,
4
+ "train_runtime": 7169.9681,
5
+ "train_samples": 5,
6
+ "train_samples_per_second": 0.011,
7
+ "train_steps_per_second": 0.003
8
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 19.666666666666668,
6
  "eval_steps": 500,
7
  "global_step": 20,
8
  "is_hyper_param_search": false,
@@ -15,37 +15,37 @@
15
  "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
- "completions/clipped_ratio": 0.0,
19
- "completions/max_length": 152.0,
20
- "completions/max_terminated_length": 152.0,
21
- "completions/mean_length": 113.66666793823242,
22
- "completions/mean_terminated_length": 113.66666793823242,
23
- "completions/min_length": 69.5,
24
- "completions/min_terminated_length": 69.5,
25
- "epoch": 1.6666666666666665,
26
- "grad_norm": 13.65064525604248,
27
  "kl": 0.0,
28
  "learning_rate": 5e-07,
29
- "loss": 0.094,
30
- "num_tokens": 3132.0,
31
- "reward": 1.9971750974655151,
32
- "reward_std": 1.0149143934249878,
33
- "rewards/concensus_correctness_reward_func/mean": 0.24050000309944153,
34
- "rewards/concensus_correctness_reward_func/std": 0.48100003600120544,
35
- "rewards/consensus_reward_func/mean": 0.25,
36
- "rewards/consensus_reward_func/std": 0.5,
37
  "rewards/cumulative_reward_2/mean": 0.0,
38
  "rewards/cumulative_reward_2/std": 0.0,
39
  "rewards/final_correctness_reward_func/mean": 0.0,
40
  "rewards/final_correctness_reward_func/std": 0.0,
41
- "rewards/question_recreation_reward_func/mean": 0.1785501390695572,
42
- "rewards/question_recreation_reward_func/std": 0.24844325333833694,
43
  "rewards/soft_format_reward_func/mean": 0.0,
44
  "rewards/soft_format_reward_func/std": 0.0,
45
- "rewards/strict_format_reward_func/mean": 0.25,
46
- "rewards/strict_format_reward_func/std": 0.28867512941360474,
47
- "rewards/xmlcount_reward_func/mean": 1.078125,
48
- "rewards/xmlcount_reward_func/std": 0.3098391965031624,
49
  "step": 2
50
  },
51
  {
@@ -55,36 +55,36 @@
55
  "clip_ratio/low_min": 0.0,
56
  "clip_ratio/region_mean": 0.0,
57
  "completions/clipped_ratio": 0.0,
58
- "completions/max_length": 121.5,
59
- "completions/max_terminated_length": 121.5,
60
- "completions/mean_length": 93.16666793823242,
61
- "completions/mean_terminated_length": 93.16666793823242,
62
- "completions/min_length": 66.0,
63
- "completions/min_terminated_length": 66.0,
64
- "epoch": 3.6666666666666665,
65
- "grad_norm": 10.07153606414795,
66
- "kl": 0.003774605953367427,
67
  "learning_rate": 4.864543104251586e-07,
68
- "loss": 0.0403,
69
- "num_tokens": 6059.0,
70
- "reward": 2.220690667629242,
71
- "reward_std": 0.9036273024976254,
72
- "rewards/concensus_correctness_reward_func/mean": 0.24050000309944153,
73
- "rewards/concensus_correctness_reward_func/std": 0.48100003600120544,
74
- "rewards/consensus_reward_func/mean": 0.25,
75
- "rewards/consensus_reward_func/std": 0.5,
76
  "rewards/cumulative_reward_2/mean": 0.0,
77
  "rewards/cumulative_reward_2/std": 0.0,
78
  "rewards/final_correctness_reward_func/mean": 0.0,
79
  "rewards/final_correctness_reward_func/std": 0.0,
80
- "rewards/question_recreation_reward_func/mean": 0.1989407055079937,
81
- "rewards/question_recreation_reward_func/std": 0.28088878467679024,
82
  "rewards/soft_format_reward_func/mean": 0.0,
83
  "rewards/soft_format_reward_func/std": 0.0,
84
- "rewards/strict_format_reward_func/mean": 0.375,
85
- "rewards/strict_format_reward_func/std": 0.25,
86
- "rewards/xmlcount_reward_func/mean": 1.15625,
87
- "rewards/xmlcount_reward_func/std": 0.1875,
88
  "step": 4
89
  },
90
  {
@@ -93,37 +93,37 @@
93
  "clip_ratio/low_mean": 0.0,
94
  "clip_ratio/low_min": 0.0,
95
  "clip_ratio/region_mean": 0.0,
96
- "completions/clipped_ratio": 0.0,
97
- "completions/max_length": 159.0,
98
- "completions/max_terminated_length": 159.0,
99
- "completions/mean_length": 113.83333206176758,
100
- "completions/mean_terminated_length": 113.83333206176758,
101
- "completions/min_length": 85.0,
102
- "completions/min_terminated_length": 85.0,
103
- "epoch": 5.666666666666667,
104
- "grad_norm": 18.319076538085938,
105
- "kl": 0.02211279346374795,
106
  "learning_rate": 4.472851273490984e-07,
107
- "loss": -0.1592,
108
- "num_tokens": 9065.0,
109
- "reward": 3.014554500579834,
110
- "reward_std": 0.5706639671698213,
111
- "rewards/concensus_correctness_reward_func/mean": 0.7215000092983246,
112
- "rewards/concensus_correctness_reward_func/std": 1.0364110171794891,
113
- "rewards/consensus_reward_func/mean": 0.75,
114
- "rewards/consensus_reward_func/std": 1.0773502588272095,
115
  "rewards/cumulative_reward_2/mean": 0.0,
116
  "rewards/cumulative_reward_2/std": 0.0,
117
  "rewards/final_correctness_reward_func/mean": 0.0,
118
  "rewards/final_correctness_reward_func/std": 0.0,
119
- "rewards/question_recreation_reward_func/mean": 0.07430431246757507,
120
- "rewards/question_recreation_reward_func/std": 0.05623785126954317,
121
  "rewards/soft_format_reward_func/mean": 0.0,
122
  "rewards/soft_format_reward_func/std": 0.0,
123
- "rewards/strict_format_reward_func/mean": 0.3125,
124
- "rewards/strict_format_reward_func/std": 0.125,
125
- "rewards/xmlcount_reward_func/mean": 1.15625,
126
- "rewards/xmlcount_reward_func/std": 0.0625,
127
  "step": 6
128
  },
129
  {
@@ -132,37 +132,37 @@
132
  "clip_ratio/low_mean": 0.0,
133
  "clip_ratio/low_min": 0.0,
134
  "clip_ratio/region_mean": 0.0,
135
- "completions/clipped_ratio": 0.16666666666666669,
136
- "completions/max_length": 316.0,
137
- "completions/max_terminated_length": 130.5,
138
- "completions/mean_length": 180.8333282470703,
139
- "completions/mean_terminated_length": 118.0,
140
- "completions/min_length": 101.5,
141
- "completions/min_terminated_length": 101.5,
142
- "epoch": 7.666666666666667,
143
- "grad_norm": 9.818954467773438,
144
- "kl": 0.01149655319750309,
145
  "learning_rate": 3.867370395306068e-07,
146
- "loss": 0.0247,
147
- "num_tokens": 12399.0,
148
- "reward": 2.7228788137435913,
149
- "reward_std": 1.667017936706543,
150
- "rewards/concensus_correctness_reward_func/mean": 0.478999987244606,
151
- "rewards/concensus_correctness_reward_func/std": 0.9579999446868896,
152
- "rewards/consensus_reward_func/mean": 0.5,
153
- "rewards/consensus_reward_func/std": 1.0,
154
  "rewards/cumulative_reward_2/mean": 0.0,
155
  "rewards/cumulative_reward_2/std": 0.0,
156
  "rewards/final_correctness_reward_func/mean": 0.0,
157
  "rewards/final_correctness_reward_func/std": 0.0,
158
- "rewards/question_recreation_reward_func/mean": 0.19700367003679276,
159
- "rewards/question_recreation_reward_func/std": 0.27262982353568077,
160
  "rewards/soft_format_reward_func/mean": 0.0,
161
  "rewards/soft_format_reward_func/std": 0.0,
162
- "rewards/strict_format_reward_func/mean": 0.375,
163
- "rewards/strict_format_reward_func/std": 0.25,
164
- "rewards/xmlcount_reward_func/mean": 1.171875,
165
- "rewards/xmlcount_reward_func/std": 0.15625,
166
  "step": 8
167
  },
168
  {
@@ -172,36 +172,36 @@
172
  "clip_ratio/low_min": 0.0,
173
  "clip_ratio/region_mean": 0.0,
174
  "completions/clipped_ratio": 0.0,
175
- "completions/max_length": 225.5,
176
- "completions/max_terminated_length": 225.5,
177
- "completions/mean_length": 149.5,
178
- "completions/mean_terminated_length": 149.5,
179
- "completions/min_length": 92.0,
180
- "completions/min_terminated_length": 92.0,
181
- "epoch": 9.666666666666666,
182
- "grad_norm": 12.751887321472168,
183
- "kl": 0.038790178950876,
184
  "learning_rate": 3.1137137178519977e-07,
185
- "loss": 0.1765,
186
- "num_tokens": 15535.0,
187
- "reward": 4.8462889194488525,
188
- "reward_std": 1.6064382195472717,
189
- "rewards/concensus_correctness_reward_func/mean": 1.4369999766349792,
190
- "rewards/concensus_correctness_reward_func/std": 0.957999974489212,
191
- "rewards/consensus_reward_func/mean": 1.5,
192
- "rewards/consensus_reward_func/std": 1.0,
193
  "rewards/cumulative_reward_2/mean": 0.0,
194
  "rewards/cumulative_reward_2/std": 0.0,
195
- "rewards/final_correctness_reward_func/mean": 0.0,
196
- "rewards/final_correctness_reward_func/std": 0.0,
197
- "rewards/question_recreation_reward_func/mean": 0.3467889428138733,
198
- "rewards/question_recreation_reward_func/std": 0.4016842842102051,
199
  "rewards/soft_format_reward_func/mean": 0.0,
200
  "rewards/soft_format_reward_func/std": 0.0,
201
- "rewards/strict_format_reward_func/mean": 0.3125,
202
- "rewards/strict_format_reward_func/std": 0.26933756470680237,
203
- "rewards/xmlcount_reward_func/mean": 1.25,
204
- "rewards/xmlcount_reward_func/std": 0.0,
205
  "step": 10
206
  },
207
  {
@@ -210,37 +210,37 @@
210
  "clip_ratio/low_mean": 0.0,
211
  "clip_ratio/low_min": 0.0,
212
  "clip_ratio/region_mean": 0.0,
213
- "completions/clipped_ratio": 0.0,
214
- "completions/max_length": 185.5,
215
- "completions/max_terminated_length": 185.5,
216
- "completions/mean_length": 126.0,
217
- "completions/mean_terminated_length": 126.0,
218
- "completions/min_length": 94.5,
219
- "completions/min_terminated_length": 94.5,
220
- "epoch": 11.666666666666666,
221
- "grad_norm": 9.57380485534668,
222
- "kl": 0.03469763998873532,
223
  "learning_rate": 2.2935516363191693e-07,
224
- "loss": -0.0734,
225
- "num_tokens": 18587.0,
226
- "reward": 4.602630615234375,
227
- "reward_std": 1.6934871673583984,
228
- "rewards/concensus_correctness_reward_func/mean": 1.443000078201294,
229
- "rewards/concensus_correctness_reward_func/std": 0.9620000123977661,
230
- "rewards/consensus_reward_func/mean": 1.5,
231
- "rewards/consensus_reward_func/std": 1.0,
232
  "rewards/cumulative_reward_2/mean": 0.0,
233
  "rewards/cumulative_reward_2/std": 0.0,
234
  "rewards/final_correctness_reward_func/mean": 0.0,
235
  "rewards/final_correctness_reward_func/std": 0.0,
236
- "rewards/question_recreation_reward_func/mean": 0.12838061526417732,
237
- "rewards/question_recreation_reward_func/std": 0.12217769771814346,
238
  "rewards/soft_format_reward_func/mean": 0.0,
239
  "rewards/soft_format_reward_func/std": 0.0,
240
- "rewards/strict_format_reward_func/mean": 0.375,
241
- "rewards/strict_format_reward_func/std": 0.25,
242
- "rewards/xmlcount_reward_func/mean": 1.15625,
243
- "rewards/xmlcount_reward_func/std": 0.1875,
244
  "step": 12
245
  },
246
  {
@@ -250,36 +250,36 @@
250
  "clip_ratio/low_min": 0.0,
251
  "clip_ratio/region_mean": 0.0,
252
  "completions/clipped_ratio": 0.0,
253
- "completions/max_length": 152.0,
254
- "completions/max_terminated_length": 152.0,
255
- "completions/mean_length": 139.33333587646484,
256
- "completions/mean_terminated_length": 139.33333587646484,
257
- "completions/min_length": 122.0,
258
- "completions/min_terminated_length": 122.0,
259
- "epoch": 13.666666666666666,
260
- "grad_norm": 9.227038383483887,
261
- "kl": 0.04416784085333347,
262
  "learning_rate": 1.4957614383675767e-07,
263
- "loss": 0.0101,
264
- "num_tokens": 21700.0,
265
- "reward": 4.582450866699219,
266
- "reward_std": 2.280969023704529,
267
- "rewards/concensus_correctness_reward_func/mean": 1.1999999582767487,
268
- "rewards/concensus_correctness_reward_func/std": 1.0342562794685364,
269
- "rewards/consensus_reward_func/mean": 1.25,
270
- "rewards/consensus_reward_func/std": 1.0773502588272095,
271
  "rewards/cumulative_reward_2/mean": 0.0,
272
  "rewards/cumulative_reward_2/std": 0.0,
273
  "rewards/final_correctness_reward_func/mean": 0.0,
274
  "rewards/final_correctness_reward_func/std": 0.0,
275
- "rewards/question_recreation_reward_func/mean": 0.5230759084224701,
276
- "rewards/question_recreation_reward_func/std": 0.5501338243484497,
277
  "rewards/soft_format_reward_func/mean": 0.0,
278
  "rewards/soft_format_reward_func/std": 0.0,
279
- "rewards/strict_format_reward_func/mean": 0.375,
280
- "rewards/strict_format_reward_func/std": 0.14433756470680237,
281
- "rewards/xmlcount_reward_func/mean": 1.234375,
282
- "rewards/xmlcount_reward_func/std": 0.03125,
283
  "step": 14
284
  },
285
  {
@@ -288,37 +288,37 @@
288
  "clip_ratio/low_mean": 0.0,
289
  "clip_ratio/low_min": 0.0,
290
  "clip_ratio/region_mean": 0.0,
291
- "completions/clipped_ratio": 0.16666666666666669,
292
- "completions/max_length": 383.5,
293
- "completions/max_terminated_length": 302.5,
294
- "completions/mean_length": 249.16666412353516,
295
- "completions/mean_terminated_length": 203.58333587646484,
296
- "completions/min_length": 124.0,
297
- "completions/min_terminated_length": 124.0,
298
- "epoch": 15.666666666666666,
299
- "grad_norm": 12.671793937683105,
300
- "kl": 0.039287379826419055,
301
  "learning_rate": 8.067960709356478e-08,
302
- "loss": -0.0068,
303
- "num_tokens": 25565.0,
304
- "reward": 3.1929807662963867,
305
- "reward_std": 0.9375366121530533,
306
- "rewards/concensus_correctness_reward_func/mean": 0.7184999883174896,
307
- "rewards/concensus_correctness_reward_func/std": 1.032946914434433,
308
- "rewards/consensus_reward_func/mean": 0.75,
309
- "rewards/consensus_reward_func/std": 1.0773502588272095,
310
  "rewards/cumulative_reward_2/mean": 0.0,
311
  "rewards/cumulative_reward_2/std": 0.0,
312
  "rewards/final_correctness_reward_func/mean": 0.0,
313
  "rewards/final_correctness_reward_func/std": 0.0,
314
- "rewards/question_recreation_reward_func/mean": 0.40010587871074677,
315
- "rewards/question_recreation_reward_func/std": 0.45765654742717743,
316
  "rewards/soft_format_reward_func/mean": 0.0,
317
  "rewards/soft_format_reward_func/std": 0.0,
318
- "rewards/strict_format_reward_func/mean": 0.25,
319
- "rewards/strict_format_reward_func/std": 0.28867512941360474,
320
- "rewards/xmlcount_reward_func/mean": 1.074375033378601,
321
- "rewards/xmlcount_reward_func/std": 0.3512499928474426,
322
  "step": 16
323
  },
324
  {
@@ -327,37 +327,37 @@
327
  "clip_ratio/low_mean": 0.0,
328
  "clip_ratio/low_min": 0.0,
329
  "clip_ratio/region_mean": 0.0,
330
- "completions/clipped_ratio": 0.0,
331
- "completions/max_length": 350.5,
332
- "completions/max_terminated_length": 350.5,
333
- "completions/mean_length": 220.99999237060547,
334
- "completions/mean_terminated_length": 220.99999237060547,
335
- "completions/min_length": 98.5,
336
- "completions/min_terminated_length": 98.5,
337
- "epoch": 17.666666666666668,
338
- "grad_norm": 7.500455856323242,
339
- "kl": 0.01603590976446867,
340
  "learning_rate": 3.013156219837776e-08,
341
- "loss": 0.0981,
342
- "num_tokens": 29204.0,
343
- "reward": 5.051450490951538,
344
- "reward_std": 1.452622190117836,
345
- "rewards/concensus_correctness_reward_func/mean": 1.6819999814033508,
346
- "rewards/concensus_correctness_reward_func/std": 0.48000001907348633,
347
- "rewards/consensus_reward_func/mean": 1.75,
348
- "rewards/consensus_reward_func/std": 0.5,
349
  "rewards/cumulative_reward_2/mean": 0.0,
350
  "rewards/cumulative_reward_2/std": 0.0,
351
  "rewards/final_correctness_reward_func/mean": 0.0,
352
  "rewards/final_correctness_reward_func/std": 0.0,
353
- "rewards/question_recreation_reward_func/mean": 0.2913255840539932,
354
- "rewards/question_recreation_reward_func/std": 0.4711288511753082,
355
  "rewards/soft_format_reward_func/mean": 0.0,
356
  "rewards/soft_format_reward_func/std": 0.0,
357
- "rewards/strict_format_reward_func/mean": 0.3125,
358
- "rewards/strict_format_reward_func/std": 0.26933756470680237,
359
- "rewards/xmlcount_reward_func/mean": 1.015625,
360
- "rewards/xmlcount_reward_func/std": 0.2991959750652313,
361
  "step": 18
362
  },
363
  {
@@ -366,53 +366,53 @@
366
  "clip_ratio/low_mean": 0.0,
367
  "clip_ratio/low_min": 0.0,
368
  "clip_ratio/region_mean": 0.0,
369
- "completions/clipped_ratio": 0.0,
370
- "completions/max_length": 191.0,
371
- "completions/max_terminated_length": 191.0,
372
- "completions/mean_length": 154.83333587646484,
373
- "completions/mean_terminated_length": 154.83333587646484,
374
- "completions/min_length": 128.0,
375
- "completions/min_terminated_length": 128.0,
376
- "epoch": 19.666666666666668,
377
- "grad_norm": 5.4542059898376465,
378
- "kl": 0.0275694252923131,
379
  "learning_rate": 3.4096741493194193e-09,
380
- "loss": -0.0749,
381
- "num_tokens": 32380.0,
382
- "reward": 5.182299613952637,
383
- "reward_std": 0.9995275139808655,
384
- "rewards/concensus_correctness_reward_func/mean": 1.6819999814033508,
385
- "rewards/concensus_correctness_reward_func/std": 0.47999998927116394,
386
- "rewards/consensus_reward_func/mean": 1.75,
387
- "rewards/consensus_reward_func/std": 0.5,
388
  "rewards/cumulative_reward_2/mean": 0.0,
389
  "rewards/cumulative_reward_2/std": 0.0,
390
  "rewards/final_correctness_reward_func/mean": 0.0,
391
  "rewards/final_correctness_reward_func/std": 0.0,
392
- "rewards/question_recreation_reward_func/mean": 0.2502993941307068,
393
- "rewards/question_recreation_reward_func/std": 0.3478831350803375,
394
  "rewards/soft_format_reward_func/mean": 0.0,
395
  "rewards/soft_format_reward_func/std": 0.0,
396
- "rewards/strict_format_reward_func/mean": 0.3125,
397
- "rewards/strict_format_reward_func/std": 0.26933756470680237,
398
- "rewards/xmlcount_reward_func/mean": 1.1875,
399
- "rewards/xmlcount_reward_func/std": 0.0883883461356163,
400
  "step": 20
401
  },
402
  {
403
- "epoch": 19.666666666666668,
404
  "step": 20,
405
  "total_flos": 0.0,
406
- "train_loss": 0.012940555065870284,
407
- "train_runtime": 4717.7232,
408
- "train_samples_per_second": 0.017,
409
- "train_steps_per_second": 0.004
410
  }
411
  ],
412
  "logging_steps": 2,
413
  "max_steps": 20,
414
- "num_input_tokens_seen": 32380,
415
- "num_train_epochs": 20,
416
  "save_steps": 25,
417
  "stateful_callbacks": {
418
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 9.8,
6
  "eval_steps": 500,
7
  "global_step": 20,
8
  "is_hyper_param_search": false,
 
15
  "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.125,
19
+ "completions/max_length": 302.0,
20
+ "completions/max_terminated_length": 92.5,
21
+ "completions/mean_length": 123.375,
22
+ "completions/mean_terminated_length": 63.83333396911621,
23
+ "completions/min_length": 48.0,
24
+ "completions/min_terminated_length": 48.0,
25
+ "epoch": 0.8,
26
+ "grad_norm": 8.92754077911377,
27
  "kl": 0.0,
28
  "learning_rate": 5e-07,
29
+ "loss": -0.1218,
30
+ "num_tokens": 3214.0,
31
+ "reward": 0.3856615126132965,
32
+ "reward_std": 0.7816920205950737,
33
+ "rewards/concensus_correctness_reward_func/mean": 0.4375,
34
+ "rewards/concensus_correctness_reward_func/std": 0.875,
35
+ "rewards/consensus_reward_func/mean": 0.0,
36
+ "rewards/consensus_reward_func/std": 0.0,
37
  "rewards/cumulative_reward_2/mean": 0.0,
38
  "rewards/cumulative_reward_2/std": 0.0,
39
  "rewards/final_correctness_reward_func/mean": 0.0,
40
  "rewards/final_correctness_reward_func/std": 0.0,
41
+ "rewards/question_recreation_reward_func/mean": 0.02203650400042534,
42
+ "rewards/question_recreation_reward_func/std": 0.023317964747548103,
43
  "rewards/soft_format_reward_func/mean": 0.0,
44
  "rewards/soft_format_reward_func/std": 0.0,
45
+ "rewards/strict_format_reward_func/mean": 0.0,
46
+ "rewards/strict_format_reward_func/std": 0.0,
47
+ "rewards/xmlcount_reward_func/mean": -0.07387500256299973,
48
+ "rewards/xmlcount_reward_func/std": 0.269045926630497,
49
  "step": 2
50
  },
51
  {
 
55
  "clip_ratio/low_min": 0.0,
56
  "clip_ratio/region_mean": 0.0,
57
  "completions/clipped_ratio": 0.0,
58
+ "completions/max_length": 204.5,
59
+ "completions/max_terminated_length": 204.5,
60
+ "completions/mean_length": 108.875,
61
+ "completions/mean_terminated_length": 108.875,
62
+ "completions/min_length": 34.0,
63
+ "completions/min_terminated_length": 34.0,
64
+ "epoch": 1.8,
65
+ "grad_norm": 14.86384391784668,
66
+ "kl": 0.001416065962985158,
67
  "learning_rate": 4.864543104251586e-07,
68
+ "loss": 0.3279,
69
+ "num_tokens": 6661.0,
70
+ "reward": 0.022667515091598034,
71
+ "reward_std": 0.010929046286037192,
72
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
73
+ "rewards/concensus_correctness_reward_func/std": 0.0,
74
+ "rewards/consensus_reward_func/mean": 0.0,
75
+ "rewards/consensus_reward_func/std": 0.0,
76
  "rewards/cumulative_reward_2/mean": 0.0,
77
  "rewards/cumulative_reward_2/std": 0.0,
78
  "rewards/final_correctness_reward_func/mean": 0.0,
79
  "rewards/final_correctness_reward_func/std": 0.0,
80
+ "rewards/question_recreation_reward_func/mean": 0.015667515341192484,
81
+ "rewards/question_recreation_reward_func/std": 0.010519207920879126,
82
  "rewards/soft_format_reward_func/mean": 0.0,
83
  "rewards/soft_format_reward_func/std": 0.0,
84
+ "rewards/strict_format_reward_func/mean": 0.0,
85
+ "rewards/strict_format_reward_func/std": 0.0,
86
+ "rewards/xmlcount_reward_func/mean": 0.007000000216066837,
87
+ "rewards/xmlcount_reward_func/std": 0.01400000136345625,
88
  "step": 4
89
  },
90
  {
 
93
  "clip_ratio/low_mean": 0.0,
94
  "clip_ratio/low_min": 0.0,
95
  "clip_ratio/region_mean": 0.0,
96
+ "completions/clipped_ratio": 0.5,
97
+ "completions/max_length": 412.0,
98
+ "completions/max_terminated_length": 156.0,
99
+ "completions/mean_length": 316.125,
100
+ "completions/mean_terminated_length": 60.125,
101
+ "completions/min_length": 270.5,
102
+ "completions/min_terminated_length": 14.5,
103
+ "epoch": 2.8,
104
+ "grad_norm": 7.786389350891113,
105
+ "kl": 0.0026882924139499664,
106
  "learning_rate": 4.472851273490984e-07,
107
+ "loss": 0.178,
108
+ "num_tokens": 10480.0,
109
+ "reward": 0.030308596324175596,
110
+ "reward_std": 0.0613415464758873,
111
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
112
+ "rewards/concensus_correctness_reward_func/std": 0.0,
113
+ "rewards/consensus_reward_func/mean": 0.0,
114
+ "rewards/consensus_reward_func/std": 0.0,
115
  "rewards/cumulative_reward_2/mean": 0.0,
116
  "rewards/cumulative_reward_2/std": 0.0,
117
  "rewards/final_correctness_reward_func/mean": 0.0,
118
  "rewards/final_correctness_reward_func/std": 0.0,
119
+ "rewards/question_recreation_reward_func/mean": 0.013933598063886166,
120
+ "rewards/question_recreation_reward_func/std": 0.014677805360406637,
121
  "rewards/soft_format_reward_func/mean": 0.0,
122
  "rewards/soft_format_reward_func/std": 0.0,
123
+ "rewards/strict_format_reward_func/mean": 0.0,
124
+ "rewards/strict_format_reward_func/std": 0.0,
125
+ "rewards/xmlcount_reward_func/mean": 0.016375000588595867,
126
+ "rewards/xmlcount_reward_func/std": 0.051155371591448784,
127
  "step": 6
128
  },
129
  {
 
132
  "clip_ratio/low_mean": 0.0,
133
  "clip_ratio/low_min": 0.0,
134
  "clip_ratio/region_mean": 0.0,
135
+ "completions/clipped_ratio": 0.5,
136
+ "completions/max_length": 447.5,
137
+ "completions/max_terminated_length": 191.5,
138
+ "completions/mean_length": 309.25,
139
+ "completions/mean_terminated_length": 53.25,
140
+ "completions/min_length": 258.0,
141
+ "completions/min_terminated_length": 2.0,
142
+ "epoch": 3.8,
143
+ "grad_norm": 5.402748107910156,
144
+ "kl": 0.005952609681116883,
145
  "learning_rate": 3.867370395306068e-07,
146
+ "loss": -0.1153,
147
+ "num_tokens": 14444.0,
148
+ "reward": 0.01363831665366888,
149
+ "reward_std": 0.0067086233757436275,
150
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
151
+ "rewards/concensus_correctness_reward_func/std": 0.0,
152
+ "rewards/consensus_reward_func/mean": 0.0,
153
+ "rewards/consensus_reward_func/std": 0.0,
154
  "rewards/cumulative_reward_2/mean": 0.0,
155
  "rewards/cumulative_reward_2/std": 0.0,
156
  "rewards/final_correctness_reward_func/mean": 0.0,
157
  "rewards/final_correctness_reward_func/std": 0.0,
158
+ "rewards/question_recreation_reward_func/mean": 0.01363831665366888,
159
+ "rewards/question_recreation_reward_func/std": 0.005987111479043961,
160
  "rewards/soft_format_reward_func/mean": 0.0,
161
  "rewards/soft_format_reward_func/std": 0.0,
162
+ "rewards/strict_format_reward_func/mean": 0.0,
163
+ "rewards/strict_format_reward_func/std": 0.0,
164
+ "rewards/xmlcount_reward_func/mean": 0.0,
165
+ "rewards/xmlcount_reward_func/std": 0.0,
166
  "step": 8
167
  },
168
  {
 
172
  "clip_ratio/low_min": 0.0,
173
  "clip_ratio/region_mean": 0.0,
174
  "completions/clipped_ratio": 0.0,
175
+ "completions/max_length": 209.0,
176
+ "completions/max_terminated_length": 209.0,
177
+ "completions/mean_length": 181.625,
178
+ "completions/mean_terminated_length": 181.625,
179
+ "completions/min_length": 169.5,
180
+ "completions/min_terminated_length": 169.5,
181
+ "epoch": 4.8,
182
+ "grad_norm": 6.952495574951172,
183
+ "kl": 0.0149484759895131,
184
  "learning_rate": 3.1137137178519977e-07,
185
+ "loss": 0.0504,
186
+ "num_tokens": 17407.0,
187
+ "reward": 0.23214832320809364,
188
+ "reward_std": 0.37722403556108475,
189
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
190
+ "rewards/concensus_correctness_reward_func/std": 0.0,
191
+ "rewards/consensus_reward_func/mean": 0.0,
192
+ "rewards/consensus_reward_func/std": 0.0,
193
  "rewards/cumulative_reward_2/mean": 0.0,
194
  "rewards/cumulative_reward_2/std": 0.0,
195
+ "rewards/final_correctness_reward_func/mean": 0.25,
196
+ "rewards/final_correctness_reward_func/std": 0.5,
197
+ "rewards/question_recreation_reward_func/mean": 0.015148311853408813,
198
+ "rewards/question_recreation_reward_func/std": 0.0038266002666205168,
199
  "rewards/soft_format_reward_func/mean": 0.0,
200
  "rewards/soft_format_reward_func/std": 0.0,
201
+ "rewards/strict_format_reward_func/mean": 0.0,
202
+ "rewards/strict_format_reward_func/std": 0.0,
203
+ "rewards/xmlcount_reward_func/mean": -0.03299999888986349,
204
+ "rewards/xmlcount_reward_func/std": 0.07268869318068027,
205
  "step": 10
206
  },
207
  {
 
210
  "clip_ratio/low_mean": 0.0,
211
  "clip_ratio/low_min": 0.0,
212
  "clip_ratio/region_mean": 0.0,
213
+ "completions/clipped_ratio": 0.125,
214
+ "completions/max_length": 301.0,
215
+ "completions/max_terminated_length": 158.0,
216
+ "completions/mean_length": 160.875,
217
+ "completions/mean_terminated_length": 114.16666412353516,
218
+ "completions/min_length": 89.0,
219
+ "completions/min_terminated_length": 89.0,
220
+ "epoch": 5.8,
221
+ "grad_norm": 16.89011573791504,
222
+ "kl": 0.017503770883195102,
223
  "learning_rate": 2.2935516363191693e-07,
224
+ "loss": 0.1,
225
+ "num_tokens": 20938.0,
226
+ "reward": -0.1888856142759323,
227
+ "reward_std": 0.33080266416072845,
228
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
229
+ "rewards/concensus_correctness_reward_func/std": 0.0,
230
+ "rewards/consensus_reward_func/mean": 0.0,
231
+ "rewards/consensus_reward_func/std": 0.0,
232
  "rewards/cumulative_reward_2/mean": 0.0,
233
  "rewards/cumulative_reward_2/std": 0.0,
234
  "rewards/final_correctness_reward_func/mean": 0.0,
235
  "rewards/final_correctness_reward_func/std": 0.0,
236
+ "rewards/question_recreation_reward_func/mean": 0.015614384785294533,
237
+ "rewards/question_recreation_reward_func/std": 0.014846977777779102,
238
  "rewards/soft_format_reward_func/mean": 0.0,
239
  "rewards/soft_format_reward_func/std": 0.0,
240
+ "rewards/strict_format_reward_func/mean": 0.0,
241
+ "rewards/strict_format_reward_func/std": 0.0,
242
+ "rewards/xmlcount_reward_func/mean": -0.20449999906122684,
243
+ "rewards/xmlcount_reward_func/std": 0.4663179814815521,
244
  "step": 12
245
  },
246
  {
 
250
  "clip_ratio/low_min": 0.0,
251
  "clip_ratio/region_mean": 0.0,
252
  "completions/clipped_ratio": 0.0,
253
+ "completions/max_length": 128.5,
254
+ "completions/max_terminated_length": 128.5,
255
+ "completions/mean_length": 87.5,
256
+ "completions/mean_terminated_length": 87.5,
257
+ "completions/min_length": 30.0,
258
+ "completions/min_terminated_length": 30.0,
259
+ "epoch": 6.8,
260
+ "grad_norm": 20.018957138061523,
261
+ "kl": 0.025806593243032694,
262
  "learning_rate": 1.4957614383675767e-07,
263
+ "loss": -0.1021,
264
+ "num_tokens": 23935.0,
265
+ "reward": 0.05173949897289276,
266
+ "reward_std": 0.0109914755448699,
267
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
268
+ "rewards/concensus_correctness_reward_func/std": 0.0,
269
+ "rewards/consensus_reward_func/mean": 0.0,
270
+ "rewards/consensus_reward_func/std": 0.0,
271
  "rewards/cumulative_reward_2/mean": 0.0,
272
  "rewards/cumulative_reward_2/std": 0.0,
273
  "rewards/final_correctness_reward_func/mean": 0.0,
274
  "rewards/final_correctness_reward_func/std": 0.0,
275
+ "rewards/question_recreation_reward_func/mean": 0.028114496264606714,
276
+ "rewards/question_recreation_reward_func/std": 0.025538412854075432,
277
  "rewards/soft_format_reward_func/mean": 0.0,
278
  "rewards/soft_format_reward_func/std": 0.0,
279
+ "rewards/strict_format_reward_func/mean": 0.0,
280
+ "rewards/strict_format_reward_func/std": 0.0,
281
+ "rewards/xmlcount_reward_func/mean": 0.02362500037997961,
282
+ "rewards/xmlcount_reward_func/std": 0.030027911067008972,
283
  "step": 14
284
  },
285
  {
 
288
  "clip_ratio/low_mean": 0.0,
289
  "clip_ratio/low_min": 0.0,
290
  "clip_ratio/region_mean": 0.0,
291
+ "completions/clipped_ratio": 0.0,
292
+ "completions/max_length": 225.5,
293
+ "completions/max_terminated_length": 225.5,
294
+ "completions/mean_length": 102.75,
295
+ "completions/mean_terminated_length": 102.75,
296
+ "completions/min_length": 47.0,
297
+ "completions/min_terminated_length": 47.0,
298
+ "epoch": 7.8,
299
+ "grad_norm": 18.695772171020508,
300
+ "kl": 0.02530490467324853,
301
  "learning_rate": 8.067960709356478e-08,
302
+ "loss": -0.0895,
303
+ "num_tokens": 27073.0,
304
+ "reward": -0.059220071882009506,
305
+ "reward_std": 0.15402239561080933,
306
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
307
+ "rewards/concensus_correctness_reward_func/std": 0.0,
308
+ "rewards/consensus_reward_func/mean": 0.0,
309
+ "rewards/consensus_reward_func/std": 0.0,
310
  "rewards/cumulative_reward_2/mean": 0.0,
311
  "rewards/cumulative_reward_2/std": 0.0,
312
  "rewards/final_correctness_reward_func/mean": 0.0,
313
  "rewards/final_correctness_reward_func/std": 0.0,
314
+ "rewards/question_recreation_reward_func/mean": 0.01565493270754814,
315
+ "rewards/question_recreation_reward_func/std": 0.009137378772720695,
316
  "rewards/soft_format_reward_func/mean": 0.0,
317
  "rewards/soft_format_reward_func/std": 0.0,
318
+ "rewards/strict_format_reward_func/mean": 0.0,
319
+ "rewards/strict_format_reward_func/std": 0.0,
320
+ "rewards/xmlcount_reward_func/mean": -0.07487499713897705,
321
+ "rewards/xmlcount_reward_func/std": 0.15669719874858856,
322
  "step": 16
323
  },
324
  {
 
327
  "clip_ratio/low_mean": 0.0,
328
  "clip_ratio/low_min": 0.0,
329
  "clip_ratio/region_mean": 0.0,
330
+ "completions/clipped_ratio": 0.125,
331
+ "completions/max_length": 415.0,
332
+ "completions/max_terminated_length": 206.5,
333
+ "completions/mean_length": 241.25,
334
+ "completions/mean_terminated_length": 183.3333339691162,
335
+ "completions/min_length": 170.0,
336
+ "completions/min_terminated_length": 170.0,
337
+ "epoch": 8.8,
338
+ "grad_norm": 51.03651809692383,
339
+ "kl": 0.05000046588247642,
340
  "learning_rate": 3.013156219837776e-08,
341
+ "loss": -0.0637,
342
+ "num_tokens": 30146.0,
343
+ "reward": -0.015393424779176712,
344
+ "reward_std": 0.0688084177672863,
345
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
346
+ "rewards/concensus_correctness_reward_func/std": 0.0,
347
+ "rewards/consensus_reward_func/mean": 0.0,
348
+ "rewards/consensus_reward_func/std": 0.0,
349
  "rewards/cumulative_reward_2/mean": 0.0,
350
  "rewards/cumulative_reward_2/std": 0.0,
351
  "rewards/final_correctness_reward_func/mean": 0.0,
352
  "rewards/final_correctness_reward_func/std": 0.0,
353
+ "rewards/question_recreation_reward_func/mean": 0.015356574673205614,
354
+ "rewards/question_recreation_reward_func/std": 0.013046635314822197,
355
  "rewards/soft_format_reward_func/mean": 0.0,
356
  "rewards/soft_format_reward_func/std": 0.0,
357
+ "rewards/strict_format_reward_func/mean": 0.0,
358
+ "rewards/strict_format_reward_func/std": 0.0,
359
+ "rewards/xmlcount_reward_func/mean": -0.03074999898672104,
360
+ "rewards/xmlcount_reward_func/std": 0.11776353418827057,
361
  "step": 18
362
  },
363
  {
 
366
  "clip_ratio/low_mean": 0.0,
367
  "clip_ratio/low_min": 0.0,
368
  "clip_ratio/region_mean": 0.0,
369
+ "completions/clipped_ratio": 0.125,
370
+ "completions/max_length": 265.5,
371
+ "completions/max_terminated_length": 56.5,
372
+ "completions/mean_length": 97.625,
373
+ "completions/mean_terminated_length": 41.66666793823242,
374
+ "completions/min_length": 34.0,
375
+ "completions/min_terminated_length": 34.0,
376
+ "epoch": 9.8,
377
+ "grad_norm": 14.597654342651367,
378
+ "kl": 0.02688464312814176,
379
  "learning_rate": 3.4096741493194193e-09,
380
+ "loss": 0.214,
381
+ "num_tokens": 33969.0,
382
+ "reward": -0.003884643316268921,
383
+ "reward_std": 0.04420278873294592,
384
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
385
+ "rewards/concensus_correctness_reward_func/std": 0.0,
386
+ "rewards/consensus_reward_func/mean": 0.0,
387
+ "rewards/consensus_reward_func/std": 0.0,
388
  "rewards/cumulative_reward_2/mean": 0.0,
389
  "rewards/cumulative_reward_2/std": 0.0,
390
  "rewards/final_correctness_reward_func/mean": 0.0,
391
  "rewards/final_correctness_reward_func/std": 0.0,
392
+ "rewards/question_recreation_reward_func/mean": 0.020615354413166642,
393
+ "rewards/question_recreation_reward_func/std": 0.019722969736903906,
394
  "rewards/soft_format_reward_func/mean": 0.0,
395
  "rewards/soft_format_reward_func/std": 0.0,
396
+ "rewards/strict_format_reward_func/mean": 0.0,
397
+ "rewards/strict_format_reward_func/std": 0.0,
398
+ "rewards/xmlcount_reward_func/mean": -0.02449999935925007,
399
+ "rewards/xmlcount_reward_func/std": 0.04899999871850014,
400
  "step": 20
401
  },
402
  {
403
+ "epoch": 9.8,
404
  "step": 20,
405
  "total_flos": 0.0,
406
+ "train_loss": 0.037779451161623,
407
+ "train_runtime": 7169.9681,
408
+ "train_samples_per_second": 0.011,
409
+ "train_steps_per_second": 0.003
410
  }
411
  ],
412
  "logging_steps": 2,
413
  "max_steps": 20,
414
+ "num_input_tokens_seen": 33969,
415
+ "num_train_epochs": 10,
416
  "save_steps": 25,
417
  "stateful_callbacks": {
418
  "TrainerControl": {