MAGICYA0 commited on
Commit
2e7feee
·
verified ·
1 Parent(s): 669cfde

End of training

Browse files
Files changed (4) hide show
  1. all_results.json +3 -3
  2. model.safetensors +1 -1
  3. train_results.json +3 -3
  4. trainer_state.json +169 -169
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.06928728232160211,
4
- "train_runtime": 1855.3992,
5
  "train_samples": 5,
6
- "train_samples_per_second": 0.086,
7
  "train_steps_per_second": 0.011
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.089614911028184,
4
+ "train_runtime": 1740.8949,
5
  "train_samples": 5,
6
+ "train_samples_per_second": 0.092,
7
  "train_steps_per_second": 0.011
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95bb083789701670cbe94151fb0f00fd1f922d7f1c1a6738a7e0f2e783ce15bf
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25d1edacd0fbd8d8c1bfbae613a6179e41755b475c67376e997c751e09272a24
3
  size 1976163472
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.06928728232160211,
4
- "train_runtime": 1855.3992,
5
  "train_samples": 5,
6
- "train_samples_per_second": 0.086,
7
  "train_steps_per_second": 0.011
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.089614911028184,
4
+ "train_runtime": 1740.8949,
5
  "train_samples": 5,
6
+ "train_samples_per_second": 0.092,
7
  "train_steps_per_second": 0.011
8
  }
trainer_state.json CHANGED
@@ -17,29 +17,29 @@
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.5,
19
  "completions/max_length": 96.0,
20
- "completions/max_terminated_length": 53.0,
21
- "completions/mean_length": 71.0,
22
- "completions/mean_terminated_length": 46.16666603088379,
23
- "completions/min_length": 36.5,
24
- "completions/min_terminated_length": 36.5,
25
  "epoch": 1.8,
26
- "grad_norm": 13.032360076904297,
27
- "kl": -1.4287097838128204e-08,
28
  "learning_rate": 5e-07,
29
- "loss": 0.1108,
30
- "num_tokens": 2713.0,
31
- "reward": 0.12375914584845304,
32
- "reward_std": 0.028197589330375195,
33
  "rewards/concensus_correctness_reward_func/mean": 0.0,
34
  "rewards/concensus_correctness_reward_func/std": 0.0,
35
  "rewards/consensus_reward_func/mean": 0.0,
36
  "rewards/consensus_reward_func/std": 0.0,
37
  "rewards/cumulative_reward_2/mean": 0.0,
38
  "rewards/cumulative_reward_2/std": 0.0,
39
- "rewards/final_correctness_reward_func/mean": 0.0,
40
- "rewards/final_correctness_reward_func/std": 0.0,
41
- "rewards/question_recreation_reward_func/mean": 0.12375915423035622,
42
- "rewards/question_recreation_reward_func/std": 0.03597170067951083,
43
  "rewards/soft_format_reward_func/mean": 0.0,
44
  "rewards/soft_format_reward_func/std": 0.0,
45
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -54,21 +54,21 @@
54
  "clip_ratio/low_mean": 0.0,
55
  "clip_ratio/low_min": 0.0,
56
  "clip_ratio/region_mean": 0.0,
57
- "completions/clipped_ratio": 0.7,
58
  "completions/max_length": 96.0,
59
- "completions/max_terminated_length": 54.5,
60
- "completions/mean_length": 79.20000076293945,
61
- "completions/mean_terminated_length": 43.5,
62
- "completions/min_length": 32.5,
63
- "completions/min_terminated_length": 32.5,
64
  "epoch": 3.8,
65
- "grad_norm": 10.995894432067871,
66
- "kl": 0.0007601117586091277,
67
  "learning_rate": 4.864543104251586e-07,
68
- "loss": -0.0212,
69
- "num_tokens": 5490.0,
70
- "reward": 0.027154644951224327,
71
- "reward_std": 0.008858403074555099,
72
  "rewards/concensus_correctness_reward_func/mean": 0.0,
73
  "rewards/concensus_correctness_reward_func/std": 0.0,
74
  "rewards/consensus_reward_func/mean": 0.0,
@@ -77,14 +77,14 @@
77
  "rewards/cumulative_reward_2/std": 0.0,
78
  "rewards/final_correctness_reward_func/mean": 0.0,
79
  "rewards/final_correctness_reward_func/std": 0.0,
80
- "rewards/question_recreation_reward_func/mean": 0.027154644951224327,
81
- "rewards/question_recreation_reward_func/std": 0.01477902289479971,
82
  "rewards/soft_format_reward_func/mean": 0.0,
83
  "rewards/soft_format_reward_func/std": 0.0,
84
  "rewards/strict_format_reward_func/mean": 0.0,
85
  "rewards/strict_format_reward_func/std": 0.0,
86
- "rewards/xmlcount_reward_func/mean": 0.0,
87
- "rewards/xmlcount_reward_func/std": 0.0,
88
  "step": 4
89
  },
90
  {
@@ -95,35 +95,35 @@
95
  "clip_ratio/region_mean": 0.0,
96
  "completions/clipped_ratio": 0.5,
97
  "completions/max_length": 96.0,
98
- "completions/max_terminated_length": 74.0,
99
- "completions/mean_length": 74.29999923706055,
100
- "completions/mean_terminated_length": 54.08333396911621,
101
- "completions/min_length": 34.0,
102
- "completions/min_terminated_length": 34.0,
103
  "epoch": 5.8,
104
- "grad_norm": 15.450472831726074,
105
- "kl": 0.005117912223795429,
106
  "learning_rate": 4.472851273490984e-07,
107
- "loss": -0.0534,
108
- "num_tokens": 8267.0,
109
- "reward": 0.03235254064202309,
110
- "reward_std": 0.010788492625579238,
111
  "rewards/concensus_correctness_reward_func/mean": 0.0,
112
  "rewards/concensus_correctness_reward_func/std": 0.0,
113
  "rewards/consensus_reward_func/mean": 0.0,
114
  "rewards/consensus_reward_func/std": 0.0,
115
  "rewards/cumulative_reward_2/mean": 0.0,
116
  "rewards/cumulative_reward_2/std": 0.0,
117
- "rewards/final_correctness_reward_func/mean": 0.0,
118
- "rewards/final_correctness_reward_func/std": 0.0,
119
- "rewards/question_recreation_reward_func/mean": 0.03235254157334566,
120
- "rewards/question_recreation_reward_func/std": 0.014937533531337976,
121
  "rewards/soft_format_reward_func/mean": 0.0,
122
  "rewards/soft_format_reward_func/std": 0.0,
123
  "rewards/strict_format_reward_func/mean": 0.0,
124
  "rewards/strict_format_reward_func/std": 0.0,
125
- "rewards/xmlcount_reward_func/mean": 0.0,
126
- "rewards/xmlcount_reward_func/std": 0.0,
127
  "step": 6
128
  },
129
  {
@@ -132,37 +132,37 @@
132
  "clip_ratio/low_mean": 0.0,
133
  "clip_ratio/low_min": 0.0,
134
  "clip_ratio/region_mean": 0.0,
135
- "completions/clipped_ratio": 0.3,
136
  "completions/max_length": 96.0,
137
- "completions/max_terminated_length": 88.0,
138
- "completions/mean_length": 60.39999961853027,
139
- "completions/mean_terminated_length": 47.125,
140
- "completions/min_length": 18.5,
141
- "completions/min_terminated_length": 18.5,
142
  "epoch": 7.8,
143
- "grad_norm": 16.364418029785156,
144
- "kl": 0.005086680190288462,
145
  "learning_rate": 3.8673703953060673e-07,
146
- "loss": 0.1469,
147
- "num_tokens": 10958.0,
148
- "reward": 0.024977175518870354,
149
- "reward_std": 0.005859495257027447,
150
  "rewards/concensus_correctness_reward_func/mean": 0.0,
151
  "rewards/concensus_correctness_reward_func/std": 0.0,
152
  "rewards/consensus_reward_func/mean": 0.0,
153
  "rewards/consensus_reward_func/std": 0.0,
154
  "rewards/cumulative_reward_2/mean": 0.0,
155
  "rewards/cumulative_reward_2/std": 0.0,
156
- "rewards/final_correctness_reward_func/mean": 0.0,
157
- "rewards/final_correctness_reward_func/std": 0.0,
158
- "rewards/question_recreation_reward_func/mean": 0.02497717458754778,
159
- "rewards/question_recreation_reward_func/std": 0.010923161637037992,
160
  "rewards/soft_format_reward_func/mean": 0.0,
161
  "rewards/soft_format_reward_func/std": 0.0,
162
  "rewards/strict_format_reward_func/mean": 0.0,
163
  "rewards/strict_format_reward_func/std": 0.0,
164
- "rewards/xmlcount_reward_func/mean": 0.0,
165
- "rewards/xmlcount_reward_func/std": 0.0,
166
  "step": 8
167
  },
168
  {
@@ -171,31 +171,31 @@
171
  "clip_ratio/low_mean": 0.0,
172
  "clip_ratio/low_min": 0.0,
173
  "clip_ratio/region_mean": 0.0,
174
- "completions/clipped_ratio": 0.39999999999999997,
175
  "completions/max_length": 96.0,
176
- "completions/max_terminated_length": 80.0,
177
- "completions/mean_length": 76.9000015258789,
178
- "completions/mean_terminated_length": 62.75,
179
- "completions/min_length": 36.0,
180
- "completions/min_terminated_length": 36.0,
181
  "epoch": 9.8,
182
- "grad_norm": 13.94202709197998,
183
- "kl": 0.012420143248164095,
184
  "learning_rate": 3.1137137178519977e-07,
185
- "loss": 0.1608,
186
- "num_tokens": 13634.0,
187
- "reward": 0.03278735093772411,
188
- "reward_std": 0.01219230075366795,
189
  "rewards/concensus_correctness_reward_func/mean": 0.0,
190
  "rewards/concensus_correctness_reward_func/std": 0.0,
191
  "rewards/consensus_reward_func/mean": 0.0,
192
  "rewards/consensus_reward_func/std": 0.0,
193
  "rewards/cumulative_reward_2/mean": 0.0,
194
  "rewards/cumulative_reward_2/std": 0.0,
195
- "rewards/final_correctness_reward_func/mean": 0.0,
196
- "rewards/final_correctness_reward_func/std": 0.0,
197
- "rewards/question_recreation_reward_func/mean": 0.03278735093772411,
198
- "rewards/question_recreation_reward_func/std": 0.01526944525539875,
199
  "rewards/soft_format_reward_func/mean": 0.0,
200
  "rewards/soft_format_reward_func/std": 0.0,
201
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -210,31 +210,31 @@
210
  "clip_ratio/low_mean": 0.0,
211
  "clip_ratio/low_min": 0.0,
212
  "clip_ratio/region_mean": 0.0,
213
- "completions/clipped_ratio": 0.6000000000000001,
214
  "completions/max_length": 96.0,
215
- "completions/max_terminated_length": 45.5,
216
- "completions/mean_length": 78.70000076293945,
217
- "completions/mean_terminated_length": 38.5,
218
- "completions/min_length": 32.0,
219
- "completions/min_terminated_length": 32.0,
220
  "epoch": 11.8,
221
- "grad_norm": 12.199621200561523,
222
- "kl": 0.008919582382077351,
223
  "learning_rate": 2.2935516363191693e-07,
224
- "loss": 0.1325,
225
- "num_tokens": 16400.0,
226
- "reward": 0.11377206444740295,
227
- "reward_std": 0.035611885599792004,
228
  "rewards/concensus_correctness_reward_func/mean": 0.0,
229
  "rewards/concensus_correctness_reward_func/std": 0.0,
230
  "rewards/consensus_reward_func/mean": 0.0,
231
  "rewards/consensus_reward_func/std": 0.0,
232
  "rewards/cumulative_reward_2/mean": 0.0,
233
  "rewards/cumulative_reward_2/std": 0.0,
234
- "rewards/final_correctness_reward_func/mean": 0.0,
235
- "rewards/final_correctness_reward_func/std": 0.0,
236
- "rewards/question_recreation_reward_func/mean": 0.11377206444740295,
237
- "rewards/question_recreation_reward_func/std": 0.04100861307233572,
238
  "rewards/soft_format_reward_func/mean": 0.0,
239
  "rewards/soft_format_reward_func/std": 0.0,
240
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -251,29 +251,29 @@
251
  "clip_ratio/region_mean": 0.0,
252
  "completions/clipped_ratio": 0.5,
253
  "completions/max_length": 96.0,
254
- "completions/max_terminated_length": 57.0,
255
- "completions/mean_length": 67.89999771118164,
256
- "completions/mean_terminated_length": 35.91666603088379,
257
- "completions/min_length": 20.0,
258
- "completions/min_terminated_length": 20.0,
259
  "epoch": 13.8,
260
- "grad_norm": 12.575211524963379,
261
- "kl": 0.01954469917109236,
262
  "learning_rate": 1.4957614383675767e-07,
263
- "loss": -0.012,
264
- "num_tokens": 18911.0,
265
- "reward": 0.03240520507097244,
266
- "reward_std": 0.017977031879127026,
267
  "rewards/concensus_correctness_reward_func/mean": 0.0,
268
  "rewards/concensus_correctness_reward_func/std": 0.0,
269
  "rewards/consensus_reward_func/mean": 0.0,
270
  "rewards/consensus_reward_func/std": 0.0,
271
  "rewards/cumulative_reward_2/mean": 0.0,
272
  "rewards/cumulative_reward_2/std": 0.0,
273
- "rewards/final_correctness_reward_func/mean": 0.0,
274
- "rewards/final_correctness_reward_func/std": 0.0,
275
- "rewards/question_recreation_reward_func/mean": 0.03240520413964987,
276
- "rewards/question_recreation_reward_func/std": 0.024350897409021854,
277
  "rewards/soft_format_reward_func/mean": 0.0,
278
  "rewards/soft_format_reward_func/std": 0.0,
279
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -288,37 +288,37 @@
288
  "clip_ratio/low_mean": 0.0,
289
  "clip_ratio/low_min": 0.0,
290
  "clip_ratio/region_mean": 0.0,
291
- "completions/clipped_ratio": 0.19999999999999996,
292
  "completions/max_length": 96.0,
293
- "completions/max_terminated_length": 74.0,
294
- "completions/mean_length": 54.79999923706055,
295
- "completions/mean_terminated_length": 44.5,
296
- "completions/min_length": 21.5,
297
- "completions/min_terminated_length": 21.5,
298
  "epoch": 15.8,
299
- "grad_norm": 14.50256633758545,
300
- "kl": 0.011643254954833537,
301
  "learning_rate": 8.067960709356475e-08,
302
- "loss": -0.0041,
303
- "num_tokens": 21540.0,
304
- "reward": 0.12224335223436356,
305
- "reward_std": 0.029093343764543533,
306
  "rewards/concensus_correctness_reward_func/mean": 0.0,
307
  "rewards/concensus_correctness_reward_func/std": 0.0,
308
  "rewards/consensus_reward_func/mean": 0.0,
309
  "rewards/consensus_reward_func/std": 0.0,
310
  "rewards/cumulative_reward_2/mean": 0.0,
311
  "rewards/cumulative_reward_2/std": 0.0,
312
- "rewards/final_correctness_reward_func/mean": 0.0,
313
- "rewards/final_correctness_reward_func/std": 0.0,
314
- "rewards/question_recreation_reward_func/mean": 0.12224335223436356,
315
- "rewards/question_recreation_reward_func/std": 0.03663408011198044,
316
  "rewards/soft_format_reward_func/mean": 0.0,
317
  "rewards/soft_format_reward_func/std": 0.0,
318
  "rewards/strict_format_reward_func/mean": 0.0,
319
  "rewards/strict_format_reward_func/std": 0.0,
320
- "rewards/xmlcount_reward_func/mean": 0.0,
321
- "rewards/xmlcount_reward_func/std": 0.0,
322
  "step": 16
323
  },
324
  {
@@ -327,37 +327,37 @@
327
  "clip_ratio/low_mean": 0.0,
328
  "clip_ratio/low_min": 0.0,
329
  "clip_ratio/region_mean": 0.0,
330
- "completions/clipped_ratio": 0.19999999999999996,
331
  "completions/max_length": 96.0,
332
- "completions/max_terminated_length": 86.5,
333
- "completions/mean_length": 68.69999885559082,
334
- "completions/mean_terminated_length": 61.875,
335
- "completions/min_length": 36.5,
336
- "completions/min_terminated_length": 36.5,
337
  "epoch": 17.8,
338
- "grad_norm": 12.17991828918457,
339
- "kl": 0.013874782802304253,
340
  "learning_rate": 3.013156219837776e-08,
341
- "loss": 0.0575,
342
- "num_tokens": 24202.0,
343
- "reward": 0.1218071561306715,
344
- "reward_std": 0.023342850618064404,
345
  "rewards/concensus_correctness_reward_func/mean": 0.0,
346
  "rewards/concensus_correctness_reward_func/std": 0.0,
347
  "rewards/consensus_reward_func/mean": 0.0,
348
  "rewards/consensus_reward_func/std": 0.0,
349
  "rewards/cumulative_reward_2/mean": 0.0,
350
  "rewards/cumulative_reward_2/std": 0.0,
351
- "rewards/final_correctness_reward_func/mean": 0.0,
352
- "rewards/final_correctness_reward_func/std": 0.0,
353
- "rewards/question_recreation_reward_func/mean": 0.1218071486800909,
354
- "rewards/question_recreation_reward_func/std": 0.03349189879372716,
355
  "rewards/soft_format_reward_func/mean": 0.0,
356
  "rewards/soft_format_reward_func/std": 0.0,
357
  "rewards/strict_format_reward_func/mean": 0.0,
358
  "rewards/strict_format_reward_func/std": 0.0,
359
- "rewards/xmlcount_reward_func/mean": 0.0,
360
- "rewards/xmlcount_reward_func/std": 0.0,
361
  "step": 18
362
  },
363
  {
@@ -366,21 +366,21 @@
366
  "clip_ratio/low_mean": 0.0,
367
  "clip_ratio/low_min": 0.0,
368
  "clip_ratio/region_mean": 0.0,
369
- "completions/clipped_ratio": 0.6,
370
  "completions/max_length": 96.0,
371
- "completions/max_terminated_length": 60.0,
372
- "completions/mean_length": 74.0,
373
- "completions/mean_terminated_length": 41.0,
374
- "completions/min_length": 22.0,
375
- "completions/min_terminated_length": 22.0,
376
  "epoch": 19.8,
377
- "grad_norm": 11.600945472717285,
378
- "kl": 0.010468967841006815,
379
  "learning_rate": 3.4096741493194193e-09,
380
- "loss": 0.1749,
381
- "num_tokens": 26879.0,
382
- "reward": 0.15840027295053005,
383
- "reward_std": 0.19570728274993598,
384
  "rewards/concensus_correctness_reward_func/mean": 0.0,
385
  "rewards/concensus_correctness_reward_func/std": 0.0,
386
  "rewards/consensus_reward_func/mean": 0.0,
@@ -389,29 +389,29 @@
389
  "rewards/cumulative_reward_2/std": 0.0,
390
  "rewards/final_correctness_reward_func/mean": 0.125,
391
  "rewards/final_correctness_reward_func/std": 0.3535533845424652,
392
- "rewards/question_recreation_reward_func/mean": 0.023462770506739616,
393
- "rewards/question_recreation_reward_func/std": 0.008887386415153742,
394
  "rewards/soft_format_reward_func/mean": 0.0,
395
  "rewards/soft_format_reward_func/std": 0.0,
396
  "rewards/strict_format_reward_func/mean": 0.0,
397
  "rewards/strict_format_reward_func/std": 0.0,
398
- "rewards/xmlcount_reward_func/mean": 0.009937499649822712,
399
- "rewards/xmlcount_reward_func/std": 0.028107494115829468,
400
  "step": 20
401
  },
402
  {
403
  "epoch": 19.8,
404
  "step": 20,
405
  "total_flos": 0.0,
406
- "train_loss": 0.06928728232160211,
407
- "train_runtime": 1855.3992,
408
- "train_samples_per_second": 0.086,
409
  "train_steps_per_second": 0.011
410
  }
411
  ],
412
  "logging_steps": 2,
413
  "max_steps": 20,
414
- "num_input_tokens_seen": 26879,
415
  "num_train_epochs": 20,
416
  "save_steps": 25,
417
  "stateful_callbacks": {
 
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.5,
19
  "completions/max_length": 96.0,
20
+ "completions/max_terminated_length": 45.0,
21
+ "completions/mean_length": 60.00000190734863,
22
+ "completions/mean_terminated_length": 25.5,
23
+ "completions/min_length": 4.5,
24
+ "completions/min_terminated_length": 4.5,
25
  "epoch": 1.8,
26
+ "grad_norm": 14.51369857788086,
27
+ "kl": -1.2729463649829853e-08,
28
  "learning_rate": 5e-07,
29
+ "loss": -0.023,
30
+ "num_tokens": 2627.0,
31
+ "reward": 0.22892774641513824,
32
+ "reward_std": 0.21130400896072388,
33
  "rewards/concensus_correctness_reward_func/mean": 0.0,
34
  "rewards/concensus_correctness_reward_func/std": 0.0,
35
  "rewards/consensus_reward_func/mean": 0.0,
36
  "rewards/consensus_reward_func/std": 0.0,
37
  "rewards/cumulative_reward_2/mean": 0.0,
38
  "rewards/cumulative_reward_2/std": 0.0,
39
+ "rewards/final_correctness_reward_func/mean": 0.125,
40
+ "rewards/final_correctness_reward_func/std": 0.3535533845424652,
41
+ "rewards/question_recreation_reward_func/mean": 0.10392776224762201,
42
+ "rewards/question_recreation_reward_func/std": 0.04285131534561515,
43
  "rewards/soft_format_reward_func/mean": 0.0,
44
  "rewards/soft_format_reward_func/std": 0.0,
45
  "rewards/strict_format_reward_func/mean": 0.0,
 
54
  "clip_ratio/low_mean": 0.0,
55
  "clip_ratio/low_min": 0.0,
56
  "clip_ratio/region_mean": 0.0,
57
+ "completions/clipped_ratio": 0.9,
58
  "completions/max_length": 96.0,
59
+ "completions/max_terminated_length": 3.5,
60
+ "completions/mean_length": 87.0999984741211,
61
+ "completions/mean_terminated_length": 3.5,
62
+ "completions/min_length": 51.5,
63
+ "completions/min_terminated_length": 3.5,
64
  "epoch": 3.8,
65
+ "grad_norm": 9.493478775024414,
66
+ "kl": 0.001022911081690836,
67
  "learning_rate": 4.864543104251586e-07,
68
+ "loss": -0.0629,
69
+ "num_tokens": 5506.0,
70
+ "reward": 0.0016773659735918045,
71
+ "reward_std": 0.02505883341655135,
72
  "rewards/concensus_correctness_reward_func/mean": 0.0,
73
  "rewards/concensus_correctness_reward_func/std": 0.0,
74
  "rewards/consensus_reward_func/mean": 0.0,
 
77
  "rewards/cumulative_reward_2/std": 0.0,
78
  "rewards/final_correctness_reward_func/mean": 0.0,
79
  "rewards/final_correctness_reward_func/std": 0.0,
80
+ "rewards/question_recreation_reward_func/mean": 0.015302364714443684,
81
+ "rewards/question_recreation_reward_func/std": 0.010023764800280333,
82
  "rewards/soft_format_reward_func/mean": 0.0,
83
  "rewards/soft_format_reward_func/std": 0.0,
84
  "rewards/strict_format_reward_func/mean": 0.0,
85
  "rewards/strict_format_reward_func/std": 0.0,
86
+ "rewards/xmlcount_reward_func/mean": -0.013624999672174454,
87
+ "rewards/xmlcount_reward_func/std": 0.038537319749593735,
88
  "step": 4
89
  },
90
  {
 
95
  "clip_ratio/region_mean": 0.0,
96
  "completions/clipped_ratio": 0.5,
97
  "completions/max_length": 96.0,
98
+ "completions/max_terminated_length": 48.5,
99
+ "completions/mean_length": 62.70000076293945,
100
+ "completions/mean_terminated_length": 29.583333015441895,
101
+ "completions/min_length": 11.0,
102
+ "completions/min_terminated_length": 11.0,
103
  "epoch": 5.8,
104
+ "grad_norm": 13.253321647644043,
105
+ "kl": 0.007930153716188215,
106
  "learning_rate": 4.472851273490984e-07,
107
+ "loss": 0.3243,
108
+ "num_tokens": 8106.0,
109
+ "reward": 0.2996043562889099,
110
+ "reward_std": 0.32131071388721466,
111
  "rewards/concensus_correctness_reward_func/mean": 0.0,
112
  "rewards/concensus_correctness_reward_func/std": 0.0,
113
  "rewards/consensus_reward_func/mean": 0.0,
114
  "rewards/consensus_reward_func/std": 0.0,
115
  "rewards/cumulative_reward_2/mean": 0.0,
116
  "rewards/cumulative_reward_2/std": 0.0,
117
+ "rewards/final_correctness_reward_func/mean": 0.25,
118
+ "rewards/final_correctness_reward_func/std": 0.7071067690849304,
119
+ "rewards/question_recreation_reward_func/mean": 0.021166879683732986,
120
+ "rewards/question_recreation_reward_func/std": 0.015909616835415363,
121
  "rewards/soft_format_reward_func/mean": 0.0,
122
  "rewards/soft_format_reward_func/std": 0.0,
123
  "rewards/strict_format_reward_func/mean": 0.0,
124
  "rewards/strict_format_reward_func/std": 0.0,
125
+ "rewards/xmlcount_reward_func/mean": 0.02843749988824129,
126
+ "rewards/xmlcount_reward_func/std": 0.06866855919361115,
127
  "step": 6
128
  },
129
  {
 
132
  "clip_ratio/low_mean": 0.0,
133
  "clip_ratio/low_min": 0.0,
134
  "clip_ratio/region_mean": 0.0,
135
+ "completions/clipped_ratio": 0.7,
136
  "completions/max_length": 96.0,
137
+ "completions/max_terminated_length": 60.0,
138
+ "completions/mean_length": 84.89999771118164,
139
+ "completions/mean_terminated_length": 51.0,
140
+ "completions/min_length": 42.0,
141
+ "completions/min_terminated_length": 42.0,
142
  "epoch": 7.8,
143
+ "grad_norm": 10.583436965942383,
144
+ "kl": 0.057811224742181366,
145
  "learning_rate": 3.8673703953060673e-07,
146
+ "loss": 0.0703,
147
+ "num_tokens": 10879.0,
148
+ "reward": 0.27628672309219837,
149
+ "reward_std": 0.022481818683445454,
150
  "rewards/concensus_correctness_reward_func/mean": 0.0,
151
  "rewards/concensus_correctness_reward_func/std": 0.0,
152
  "rewards/consensus_reward_func/mean": 0.0,
153
  "rewards/consensus_reward_func/std": 0.0,
154
  "rewards/cumulative_reward_2/mean": 0.0,
155
  "rewards/cumulative_reward_2/std": 0.0,
156
+ "rewards/final_correctness_reward_func/mean": 0.25,
157
+ "rewards/final_correctness_reward_func/std": 0.4629100561141968,
158
+ "rewards/question_recreation_reward_func/mean": 0.01847423054277897,
159
+ "rewards/question_recreation_reward_func/std": 0.012820018921047449,
160
  "rewards/soft_format_reward_func/mean": 0.0,
161
  "rewards/soft_format_reward_func/std": 0.0,
162
  "rewards/strict_format_reward_func/mean": 0.0,
163
  "rewards/strict_format_reward_func/std": 0.0,
164
+ "rewards/xmlcount_reward_func/mean": 0.0078125,
165
+ "rewards/xmlcount_reward_func/std": 0.022097086533904076,
166
  "step": 8
167
  },
168
  {
 
171
  "clip_ratio/low_mean": 0.0,
172
  "clip_ratio/low_min": 0.0,
173
  "clip_ratio/region_mean": 0.0,
174
+ "completions/clipped_ratio": 0.5,
175
  "completions/max_length": 96.0,
176
+ "completions/max_terminated_length": 47.5,
177
+ "completions/mean_length": 71.70000076293945,
178
+ "completions/mean_terminated_length": 34.5,
179
+ "completions/min_length": 13.0,
180
+ "completions/min_terminated_length": 13.0,
181
  "epoch": 9.8,
182
+ "grad_norm": 14.598203659057617,
183
+ "kl": 0.00804947045253357,
184
  "learning_rate": 3.1137137178519977e-07,
185
+ "loss": 0.1708,
186
+ "num_tokens": 13520.0,
187
+ "reward": 0.2736130654811859,
188
+ "reward_std": 0.3600814640522003,
189
  "rewards/concensus_correctness_reward_func/mean": 0.0,
190
  "rewards/concensus_correctness_reward_func/std": 0.0,
191
  "rewards/consensus_reward_func/mean": 0.0,
192
  "rewards/consensus_reward_func/std": 0.0,
193
  "rewards/cumulative_reward_2/mean": 0.0,
194
  "rewards/cumulative_reward_2/std": 0.0,
195
+ "rewards/final_correctness_reward_func/mean": 0.25,
196
+ "rewards/final_correctness_reward_func/std": 0.7071067690849304,
197
+ "rewards/question_recreation_reward_func/mean": 0.02361306920647621,
198
+ "rewards/question_recreation_reward_func/std": 0.015552400611341,
199
  "rewards/soft_format_reward_func/mean": 0.0,
200
  "rewards/soft_format_reward_func/std": 0.0,
201
  "rewards/strict_format_reward_func/mean": 0.0,
 
210
  "clip_ratio/low_mean": 0.0,
211
  "clip_ratio/low_min": 0.0,
212
  "clip_ratio/region_mean": 0.0,
213
+ "completions/clipped_ratio": 0.5,
214
  "completions/max_length": 96.0,
215
+ "completions/max_terminated_length": 31.0,
216
+ "completions/mean_length": 57.60000038146973,
217
+ "completions/mean_terminated_length": 22.0,
218
+ "completions/min_length": 16.0,
219
+ "completions/min_terminated_length": 16.0,
220
  "epoch": 11.8,
221
+ "grad_norm": 23.344341278076172,
222
+ "kl": 0.13446828180894954,
223
  "learning_rate": 2.2935516363191693e-07,
224
+ "loss": 0.0719,
225
+ "num_tokens": 15932.0,
226
+ "reward": 0.4726976007223129,
227
+ "reward_std": 0.19187587685883045,
228
  "rewards/concensus_correctness_reward_func/mean": 0.0,
229
  "rewards/concensus_correctness_reward_func/std": 0.0,
230
  "rewards/consensus_reward_func/mean": 0.0,
231
  "rewards/consensus_reward_func/std": 0.0,
232
  "rewards/cumulative_reward_2/mean": 0.0,
233
  "rewards/cumulative_reward_2/std": 0.0,
234
+ "rewards/final_correctness_reward_func/mean": 0.375,
235
+ "rewards/final_correctness_reward_func/std": 0.816463440656662,
236
+ "rewards/question_recreation_reward_func/mean": 0.09769760258495808,
237
+ "rewards/question_recreation_reward_func/std": 0.06193104526028037,
238
  "rewards/soft_format_reward_func/mean": 0.0,
239
  "rewards/soft_format_reward_func/std": 0.0,
240
  "rewards/strict_format_reward_func/mean": 0.0,
 
251
  "clip_ratio/region_mean": 0.0,
252
  "completions/clipped_ratio": 0.5,
253
  "completions/max_length": 96.0,
254
+ "completions/max_terminated_length": 25.0,
255
+ "completions/mean_length": 56.39999771118164,
256
+ "completions/mean_terminated_length": 11.25,
257
+ "completions/min_length": 2.0,
258
+ "completions/min_terminated_length": 2.0,
259
  "epoch": 13.8,
260
+ "grad_norm": 17.271100997924805,
261
+ "kl": 0.08348440888221376,
262
  "learning_rate": 1.4957614383675767e-07,
263
+ "loss": 0.1353,
264
+ "num_tokens": 18389.0,
265
+ "reward": 0.39730143547058105,
266
+ "reward_std": 0.18547067046165466,
267
  "rewards/concensus_correctness_reward_func/mean": 0.0,
268
  "rewards/concensus_correctness_reward_func/std": 0.0,
269
  "rewards/consensus_reward_func/mean": 0.0,
270
  "rewards/consensus_reward_func/std": 0.0,
271
  "rewards/cumulative_reward_2/mean": 0.0,
272
  "rewards/cumulative_reward_2/std": 0.0,
273
+ "rewards/final_correctness_reward_func/mean": 0.375,
274
+ "rewards/final_correctness_reward_func/std": 0.816463440656662,
275
+ "rewards/question_recreation_reward_func/mean": 0.022301463410258293,
276
+ "rewards/question_recreation_reward_func/std": 0.016719398088753223,
277
  "rewards/soft_format_reward_func/mean": 0.0,
278
  "rewards/soft_format_reward_func/std": 0.0,
279
  "rewards/strict_format_reward_func/mean": 0.0,
 
288
  "clip_ratio/low_mean": 0.0,
289
  "clip_ratio/low_min": 0.0,
290
  "clip_ratio/region_mean": 0.0,
291
+ "completions/clipped_ratio": 0.3,
292
  "completions/max_length": 96.0,
293
+ "completions/max_terminated_length": 72.5,
294
+ "completions/mean_length": 52.39999961853027,
295
+ "completions/mean_terminated_length": 36.29166603088379,
296
+ "completions/min_length": 13.0,
297
+ "completions/min_terminated_length": 13.0,
298
  "epoch": 15.8,
299
+ "grad_norm": 17.390409469604492,
300
+ "kl": 0.1617226421367377,
301
  "learning_rate": 8.067960709356475e-08,
302
+ "loss": 0.0343,
303
+ "num_tokens": 20644.0,
304
+ "reward": 0.6499089896678925,
305
+ "reward_std": 0.1050245389342308,
306
  "rewards/concensus_correctness_reward_func/mean": 0.0,
307
  "rewards/concensus_correctness_reward_func/std": 0.0,
308
  "rewards/consensus_reward_func/mean": 0.0,
309
  "rewards/consensus_reward_func/std": 0.0,
310
  "rewards/cumulative_reward_2/mean": 0.0,
311
  "rewards/cumulative_reward_2/std": 0.0,
312
+ "rewards/final_correctness_reward_func/mean": 0.5,
313
+ "rewards/final_correctness_reward_func/std": 0.9258201122283936,
314
+ "rewards/question_recreation_reward_func/mean": 0.08803398394957185,
315
+ "rewards/question_recreation_reward_func/std": 0.0536980046890676,
316
  "rewards/soft_format_reward_func/mean": 0.0,
317
  "rewards/soft_format_reward_func/std": 0.0,
318
  "rewards/strict_format_reward_func/mean": 0.0,
319
  "rewards/strict_format_reward_func/std": 0.0,
320
+ "rewards/xmlcount_reward_func/mean": 0.0618749987334013,
321
+ "rewards/xmlcount_reward_func/std": 0.17500893026590347,
322
  "step": 16
323
  },
324
  {
 
327
  "clip_ratio/low_mean": 0.0,
328
  "clip_ratio/low_min": 0.0,
329
  "clip_ratio/region_mean": 0.0,
330
+ "completions/clipped_ratio": 0.39999999999999997,
331
  "completions/max_length": 96.0,
332
+ "completions/max_terminated_length": 58.5,
333
+ "completions/mean_length": 62.39999961853027,
334
+ "completions/mean_terminated_length": 40.875,
335
+ "completions/min_length": 23.5,
336
+ "completions/min_terminated_length": 23.5,
337
  "epoch": 17.8,
338
+ "grad_norm": 13.08644962310791,
339
+ "kl": 0.08081291774578858,
340
  "learning_rate": 3.013156219837776e-08,
341
+ "loss": -0.0031,
342
+ "num_tokens": 23161.0,
343
+ "reward": 0.35994406789541245,
344
+ "reward_std": 0.03296606941148639,
345
  "rewards/concensus_correctness_reward_func/mean": 0.0,
346
  "rewards/concensus_correctness_reward_func/std": 0.0,
347
  "rewards/consensus_reward_func/mean": 0.0,
348
  "rewards/consensus_reward_func/std": 0.0,
349
  "rewards/cumulative_reward_2/mean": 0.0,
350
  "rewards/cumulative_reward_2/std": 0.0,
351
+ "rewards/final_correctness_reward_func/mean": 0.25,
352
+ "rewards/final_correctness_reward_func/std": 0.4629100561141968,
353
+ "rewards/question_recreation_reward_func/mean": 0.11738158855587244,
354
+ "rewards/question_recreation_reward_func/std": 0.029859440866857767,
355
  "rewards/soft_format_reward_func/mean": 0.0,
356
  "rewards/soft_format_reward_func/std": 0.0,
357
  "rewards/strict_format_reward_func/mean": 0.0,
358
  "rewards/strict_format_reward_func/std": 0.0,
359
+ "rewards/xmlcount_reward_func/mean": -0.007437500171363354,
360
+ "rewards/xmlcount_reward_func/std": 0.021036427468061447,
361
  "step": 18
362
  },
363
  {
 
366
  "clip_ratio/low_mean": 0.0,
367
  "clip_ratio/low_min": 0.0,
368
  "clip_ratio/region_mean": 0.0,
369
+ "completions/clipped_ratio": 0.3,
370
  "completions/max_length": 96.0,
371
+ "completions/max_terminated_length": 70.0,
372
+ "completions/mean_length": 56.0,
373
+ "completions/mean_terminated_length": 39.33333396911621,
374
+ "completions/min_length": 8.5,
375
+ "completions/min_terminated_length": 8.5,
376
  "epoch": 19.8,
377
+ "grad_norm": 15.382369995117188,
378
+ "kl": 0.02692569710779935,
379
  "learning_rate": 3.4096741493194193e-09,
380
+ "loss": 0.1782,
381
+ "num_tokens": 25599.0,
382
+ "reward": 0.16119842790067196,
383
+ "reward_std": 0.1692181215621531,
384
  "rewards/concensus_correctness_reward_func/mean": 0.0,
385
  "rewards/concensus_correctness_reward_func/std": 0.0,
386
  "rewards/consensus_reward_func/mean": 0.0,
 
389
  "rewards/cumulative_reward_2/std": 0.0,
390
  "rewards/final_correctness_reward_func/mean": 0.125,
391
  "rewards/final_correctness_reward_func/std": 0.3535533845424652,
392
+ "rewards/question_recreation_reward_func/mean": 0.028385925106704235,
393
+ "rewards/question_recreation_reward_func/std": 0.014382507652044296,
394
  "rewards/soft_format_reward_func/mean": 0.0,
395
  "rewards/soft_format_reward_func/std": 0.0,
396
  "rewards/strict_format_reward_func/mean": 0.0,
397
  "rewards/strict_format_reward_func/std": 0.0,
398
+ "rewards/xmlcount_reward_func/mean": 0.0078125,
399
+ "rewards/xmlcount_reward_func/std": 0.022097086533904076,
400
  "step": 20
401
  },
402
  {
403
  "epoch": 19.8,
404
  "step": 20,
405
  "total_flos": 0.0,
406
+ "train_loss": 0.089614911028184,
407
+ "train_runtime": 1740.8949,
408
+ "train_samples_per_second": 0.092,
409
  "train_steps_per_second": 0.011
410
  }
411
  ],
412
  "logging_steps": 2,
413
  "max_steps": 20,
414
+ "num_input_tokens_seen": 25599,
415
  "num_train_epochs": 20,
416
  "save_steps": 25,
417
  "stateful_callbacks": {