Avokado777 commited on
Commit
90c257b
·
verified ·
1 Parent(s): d3daf77

End of training

Browse files
README.md CHANGED
@@ -30,7 +30,7 @@ print(output["generated_text"])
30
 
31
  ## Training procedure
32
 
33
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/corobov-mitya-individual/huggingface/runs/an7qp0tx)
34
 
35
 
36
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
30
 
31
  ## Training procedure
32
 
33
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/corobov-mitya-individual/huggingface/runs/zcdsijaj)
34
 
35
 
36
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.037779451161623,
4
- "train_runtime": 7169.9681,
5
- "train_samples": 5,
6
- "train_samples_per_second": 0.011,
7
- "train_steps_per_second": 0.003
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": -0.03877150900661945,
4
+ "train_runtime": 4996.2526,
5
+ "train_samples": 3,
6
+ "train_samples_per_second": 0.016,
7
+ "train_steps_per_second": 0.004
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:adecc22f303850912efbf20277ad42ab67f6089c6d70133581f073f6e2538487
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdb70c227c8abf2dab49c2c098a88d98e158211501b71354a6d982fdb85fc70c
3
  size 1976163472
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.037779451161623,
4
- "train_runtime": 7169.9681,
5
- "train_samples": 5,
6
- "train_samples_per_second": 0.011,
7
- "train_steps_per_second": 0.003
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": -0.03877150900661945,
4
+ "train_runtime": 4996.2526,
5
+ "train_samples": 3,
6
+ "train_samples_per_second": 0.016,
7
+ "train_steps_per_second": 0.004
8
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 9.8,
6
  "eval_steps": 500,
7
  "global_step": 20,
8
  "is_hyper_param_search": false,
@@ -15,37 +15,37 @@
15
  "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
- "completions/clipped_ratio": 0.125,
19
- "completions/max_length": 302.0,
20
- "completions/max_terminated_length": 92.5,
21
- "completions/mean_length": 123.375,
22
- "completions/mean_terminated_length": 63.83333396911621,
23
- "completions/min_length": 48.0,
24
- "completions/min_terminated_length": 48.0,
25
- "epoch": 0.8,
26
- "grad_norm": 8.92754077911377,
27
  "kl": 0.0,
28
  "learning_rate": 5e-07,
29
- "loss": -0.1218,
30
- "num_tokens": 3214.0,
31
- "reward": 0.3856615126132965,
32
- "reward_std": 0.7816920205950737,
33
- "rewards/concensus_correctness_reward_func/mean": 0.4375,
34
- "rewards/concensus_correctness_reward_func/std": 0.875,
35
- "rewards/consensus_reward_func/mean": 0.0,
36
- "rewards/consensus_reward_func/std": 0.0,
37
  "rewards/cumulative_reward_2/mean": 0.0,
38
  "rewards/cumulative_reward_2/std": 0.0,
39
  "rewards/final_correctness_reward_func/mean": 0.0,
40
  "rewards/final_correctness_reward_func/std": 0.0,
41
- "rewards/question_recreation_reward_func/mean": 0.02203650400042534,
42
- "rewards/question_recreation_reward_func/std": 0.023317964747548103,
43
  "rewards/soft_format_reward_func/mean": 0.0,
44
  "rewards/soft_format_reward_func/std": 0.0,
45
  "rewards/strict_format_reward_func/mean": 0.0,
46
  "rewards/strict_format_reward_func/std": 0.0,
47
- "rewards/xmlcount_reward_func/mean": -0.07387500256299973,
48
- "rewards/xmlcount_reward_func/std": 0.269045926630497,
49
  "step": 2
50
  },
51
  {
@@ -54,37 +54,37 @@
54
  "clip_ratio/low_mean": 0.0,
55
  "clip_ratio/low_min": 0.0,
56
  "clip_ratio/region_mean": 0.0,
57
- "completions/clipped_ratio": 0.0,
58
- "completions/max_length": 204.5,
59
- "completions/max_terminated_length": 204.5,
60
- "completions/mean_length": 108.875,
61
- "completions/mean_terminated_length": 108.875,
62
- "completions/min_length": 34.0,
63
- "completions/min_terminated_length": 34.0,
64
- "epoch": 1.8,
65
- "grad_norm": 14.86384391784668,
66
- "kl": 0.001416065962985158,
67
  "learning_rate": 4.864543104251586e-07,
68
- "loss": 0.3279,
69
- "num_tokens": 6661.0,
70
- "reward": 0.022667515091598034,
71
- "reward_std": 0.010929046286037192,
72
- "rewards/concensus_correctness_reward_func/mean": 0.0,
73
- "rewards/concensus_correctness_reward_func/std": 0.0,
74
- "rewards/consensus_reward_func/mean": 0.0,
75
- "rewards/consensus_reward_func/std": 0.0,
76
  "rewards/cumulative_reward_2/mean": 0.0,
77
  "rewards/cumulative_reward_2/std": 0.0,
78
  "rewards/final_correctness_reward_func/mean": 0.0,
79
  "rewards/final_correctness_reward_func/std": 0.0,
80
- "rewards/question_recreation_reward_func/mean": 0.015667515341192484,
81
- "rewards/question_recreation_reward_func/std": 0.010519207920879126,
82
  "rewards/soft_format_reward_func/mean": 0.0,
83
  "rewards/soft_format_reward_func/std": 0.0,
84
  "rewards/strict_format_reward_func/mean": 0.0,
85
  "rewards/strict_format_reward_func/std": 0.0,
86
- "rewards/xmlcount_reward_func/mean": 0.007000000216066837,
87
- "rewards/xmlcount_reward_func/std": 0.01400000136345625,
88
  "step": 4
89
  },
90
  {
@@ -93,21 +93,21 @@
93
  "clip_ratio/low_mean": 0.0,
94
  "clip_ratio/low_min": 0.0,
95
  "clip_ratio/region_mean": 0.0,
96
- "completions/clipped_ratio": 0.5,
97
- "completions/max_length": 412.0,
98
- "completions/max_terminated_length": 156.0,
99
- "completions/mean_length": 316.125,
100
- "completions/mean_terminated_length": 60.125,
101
- "completions/min_length": 270.5,
102
- "completions/min_terminated_length": 14.5,
103
- "epoch": 2.8,
104
- "grad_norm": 7.786389350891113,
105
- "kl": 0.0026882924139499664,
106
  "learning_rate": 4.472851273490984e-07,
107
- "loss": 0.178,
108
- "num_tokens": 10480.0,
109
- "reward": 0.030308596324175596,
110
- "reward_std": 0.0613415464758873,
111
  "rewards/concensus_correctness_reward_func/mean": 0.0,
112
  "rewards/concensus_correctness_reward_func/std": 0.0,
113
  "rewards/consensus_reward_func/mean": 0.0,
@@ -116,14 +116,14 @@
116
  "rewards/cumulative_reward_2/std": 0.0,
117
  "rewards/final_correctness_reward_func/mean": 0.0,
118
  "rewards/final_correctness_reward_func/std": 0.0,
119
- "rewards/question_recreation_reward_func/mean": 0.013933598063886166,
120
- "rewards/question_recreation_reward_func/std": 0.014677805360406637,
121
  "rewards/soft_format_reward_func/mean": 0.0,
122
  "rewards/soft_format_reward_func/std": 0.0,
123
  "rewards/strict_format_reward_func/mean": 0.0,
124
  "rewards/strict_format_reward_func/std": 0.0,
125
- "rewards/xmlcount_reward_func/mean": 0.016375000588595867,
126
- "rewards/xmlcount_reward_func/std": 0.051155371591448784,
127
  "step": 6
128
  },
129
  {
@@ -132,21 +132,21 @@
132
  "clip_ratio/low_mean": 0.0,
133
  "clip_ratio/low_min": 0.0,
134
  "clip_ratio/region_mean": 0.0,
135
- "completions/clipped_ratio": 0.5,
136
- "completions/max_length": 447.5,
137
- "completions/max_terminated_length": 191.5,
138
- "completions/mean_length": 309.25,
139
- "completions/mean_terminated_length": 53.25,
140
- "completions/min_length": 258.0,
141
- "completions/min_terminated_length": 2.0,
142
- "epoch": 3.8,
143
- "grad_norm": 5.402748107910156,
144
- "kl": 0.005952609681116883,
145
  "learning_rate": 3.867370395306068e-07,
146
- "loss": -0.1153,
147
- "num_tokens": 14444.0,
148
- "reward": 0.01363831665366888,
149
- "reward_std": 0.0067086233757436275,
150
  "rewards/concensus_correctness_reward_func/mean": 0.0,
151
  "rewards/concensus_correctness_reward_func/std": 0.0,
152
  "rewards/consensus_reward_func/mean": 0.0,
@@ -155,14 +155,14 @@
155
  "rewards/cumulative_reward_2/std": 0.0,
156
  "rewards/final_correctness_reward_func/mean": 0.0,
157
  "rewards/final_correctness_reward_func/std": 0.0,
158
- "rewards/question_recreation_reward_func/mean": 0.01363831665366888,
159
- "rewards/question_recreation_reward_func/std": 0.005987111479043961,
160
  "rewards/soft_format_reward_func/mean": 0.0,
161
  "rewards/soft_format_reward_func/std": 0.0,
162
  "rewards/strict_format_reward_func/mean": 0.0,
163
  "rewards/strict_format_reward_func/std": 0.0,
164
- "rewards/xmlcount_reward_func/mean": 0.0,
165
- "rewards/xmlcount_reward_func/std": 0.0,
166
  "step": 8
167
  },
168
  {
@@ -172,36 +172,36 @@
172
  "clip_ratio/low_min": 0.0,
173
  "clip_ratio/region_mean": 0.0,
174
  "completions/clipped_ratio": 0.0,
175
- "completions/max_length": 209.0,
176
- "completions/max_terminated_length": 209.0,
177
- "completions/mean_length": 181.625,
178
- "completions/mean_terminated_length": 181.625,
179
- "completions/min_length": 169.5,
180
- "completions/min_terminated_length": 169.5,
181
- "epoch": 4.8,
182
- "grad_norm": 6.952495574951172,
183
- "kl": 0.0149484759895131,
184
  "learning_rate": 3.1137137178519977e-07,
185
- "loss": 0.0504,
186
- "num_tokens": 17407.0,
187
- "reward": 0.23214832320809364,
188
- "reward_std": 0.37722403556108475,
189
  "rewards/concensus_correctness_reward_func/mean": 0.0,
190
  "rewards/concensus_correctness_reward_func/std": 0.0,
191
  "rewards/consensus_reward_func/mean": 0.0,
192
  "rewards/consensus_reward_func/std": 0.0,
193
  "rewards/cumulative_reward_2/mean": 0.0,
194
  "rewards/cumulative_reward_2/std": 0.0,
195
- "rewards/final_correctness_reward_func/mean": 0.25,
196
- "rewards/final_correctness_reward_func/std": 0.5,
197
- "rewards/question_recreation_reward_func/mean": 0.015148311853408813,
198
- "rewards/question_recreation_reward_func/std": 0.0038266002666205168,
199
  "rewards/soft_format_reward_func/mean": 0.0,
200
  "rewards/soft_format_reward_func/std": 0.0,
201
  "rewards/strict_format_reward_func/mean": 0.0,
202
  "rewards/strict_format_reward_func/std": 0.0,
203
- "rewards/xmlcount_reward_func/mean": -0.03299999888986349,
204
- "rewards/xmlcount_reward_func/std": 0.07268869318068027,
205
  "step": 10
206
  },
207
  {
@@ -210,21 +210,21 @@
210
  "clip_ratio/low_mean": 0.0,
211
  "clip_ratio/low_min": 0.0,
212
  "clip_ratio/region_mean": 0.0,
213
- "completions/clipped_ratio": 0.125,
214
- "completions/max_length": 301.0,
215
- "completions/max_terminated_length": 158.0,
216
- "completions/mean_length": 160.875,
217
- "completions/mean_terminated_length": 114.16666412353516,
218
- "completions/min_length": 89.0,
219
- "completions/min_terminated_length": 89.0,
220
- "epoch": 5.8,
221
- "grad_norm": 16.89011573791504,
222
- "kl": 0.017503770883195102,
223
  "learning_rate": 2.2935516363191693e-07,
224
- "loss": 0.1,
225
- "num_tokens": 20938.0,
226
- "reward": -0.1888856142759323,
227
- "reward_std": 0.33080266416072845,
228
  "rewards/concensus_correctness_reward_func/mean": 0.0,
229
  "rewards/concensus_correctness_reward_func/std": 0.0,
230
  "rewards/consensus_reward_func/mean": 0.0,
@@ -233,14 +233,14 @@
233
  "rewards/cumulative_reward_2/std": 0.0,
234
  "rewards/final_correctness_reward_func/mean": 0.0,
235
  "rewards/final_correctness_reward_func/std": 0.0,
236
- "rewards/question_recreation_reward_func/mean": 0.015614384785294533,
237
- "rewards/question_recreation_reward_func/std": 0.014846977777779102,
238
  "rewards/soft_format_reward_func/mean": 0.0,
239
  "rewards/soft_format_reward_func/std": 0.0,
240
  "rewards/strict_format_reward_func/mean": 0.0,
241
  "rewards/strict_format_reward_func/std": 0.0,
242
- "rewards/xmlcount_reward_func/mean": -0.20449999906122684,
243
- "rewards/xmlcount_reward_func/std": 0.4663179814815521,
244
  "step": 12
245
  },
246
  {
@@ -250,36 +250,36 @@
250
  "clip_ratio/low_min": 0.0,
251
  "clip_ratio/region_mean": 0.0,
252
  "completions/clipped_ratio": 0.0,
253
- "completions/max_length": 128.5,
254
- "completions/max_terminated_length": 128.5,
255
- "completions/mean_length": 87.5,
256
- "completions/mean_terminated_length": 87.5,
257
- "completions/min_length": 30.0,
258
- "completions/min_terminated_length": 30.0,
259
- "epoch": 6.8,
260
- "grad_norm": 20.018957138061523,
261
- "kl": 0.025806593243032694,
262
  "learning_rate": 1.4957614383675767e-07,
263
- "loss": -0.1021,
264
- "num_tokens": 23935.0,
265
- "reward": 0.05173949897289276,
266
- "reward_std": 0.0109914755448699,
267
- "rewards/concensus_correctness_reward_func/mean": 0.0,
268
- "rewards/concensus_correctness_reward_func/std": 0.0,
269
- "rewards/consensus_reward_func/mean": 0.0,
270
- "rewards/consensus_reward_func/std": 0.0,
271
  "rewards/cumulative_reward_2/mean": 0.0,
272
  "rewards/cumulative_reward_2/std": 0.0,
273
  "rewards/final_correctness_reward_func/mean": 0.0,
274
  "rewards/final_correctness_reward_func/std": 0.0,
275
- "rewards/question_recreation_reward_func/mean": 0.028114496264606714,
276
- "rewards/question_recreation_reward_func/std": 0.025538412854075432,
277
  "rewards/soft_format_reward_func/mean": 0.0,
278
  "rewards/soft_format_reward_func/std": 0.0,
279
  "rewards/strict_format_reward_func/mean": 0.0,
280
  "rewards/strict_format_reward_func/std": 0.0,
281
- "rewards/xmlcount_reward_func/mean": 0.02362500037997961,
282
- "rewards/xmlcount_reward_func/std": 0.030027911067008972,
283
  "step": 14
284
  },
285
  {
@@ -288,21 +288,21 @@
288
  "clip_ratio/low_mean": 0.0,
289
  "clip_ratio/low_min": 0.0,
290
  "clip_ratio/region_mean": 0.0,
291
- "completions/clipped_ratio": 0.0,
292
- "completions/max_length": 225.5,
293
- "completions/max_terminated_length": 225.5,
294
- "completions/mean_length": 102.75,
295
- "completions/mean_terminated_length": 102.75,
296
- "completions/min_length": 47.0,
297
- "completions/min_terminated_length": 47.0,
298
- "epoch": 7.8,
299
- "grad_norm": 18.695772171020508,
300
- "kl": 0.02530490467324853,
301
  "learning_rate": 8.067960709356478e-08,
302
- "loss": -0.0895,
303
- "num_tokens": 27073.0,
304
- "reward": -0.059220071882009506,
305
- "reward_std": 0.15402239561080933,
306
  "rewards/concensus_correctness_reward_func/mean": 0.0,
307
  "rewards/concensus_correctness_reward_func/std": 0.0,
308
  "rewards/consensus_reward_func/mean": 0.0,
@@ -311,14 +311,14 @@
311
  "rewards/cumulative_reward_2/std": 0.0,
312
  "rewards/final_correctness_reward_func/mean": 0.0,
313
  "rewards/final_correctness_reward_func/std": 0.0,
314
- "rewards/question_recreation_reward_func/mean": 0.01565493270754814,
315
- "rewards/question_recreation_reward_func/std": 0.009137378772720695,
316
  "rewards/soft_format_reward_func/mean": 0.0,
317
  "rewards/soft_format_reward_func/std": 0.0,
318
  "rewards/strict_format_reward_func/mean": 0.0,
319
  "rewards/strict_format_reward_func/std": 0.0,
320
- "rewards/xmlcount_reward_func/mean": -0.07487499713897705,
321
- "rewards/xmlcount_reward_func/std": 0.15669719874858856,
322
  "step": 16
323
  },
324
  {
@@ -327,37 +327,37 @@
327
  "clip_ratio/low_mean": 0.0,
328
  "clip_ratio/low_min": 0.0,
329
  "clip_ratio/region_mean": 0.0,
330
- "completions/clipped_ratio": 0.125,
331
- "completions/max_length": 415.0,
332
- "completions/max_terminated_length": 206.5,
333
- "completions/mean_length": 241.25,
334
- "completions/mean_terminated_length": 183.3333339691162,
335
- "completions/min_length": 170.0,
336
- "completions/min_terminated_length": 170.0,
337
- "epoch": 8.8,
338
- "grad_norm": 51.03651809692383,
339
- "kl": 0.05000046588247642,
340
  "learning_rate": 3.013156219837776e-08,
341
- "loss": -0.0637,
342
- "num_tokens": 30146.0,
343
- "reward": -0.015393424779176712,
344
- "reward_std": 0.0688084177672863,
345
- "rewards/concensus_correctness_reward_func/mean": 0.0,
346
- "rewards/concensus_correctness_reward_func/std": 0.0,
347
- "rewards/consensus_reward_func/mean": 0.0,
348
- "rewards/consensus_reward_func/std": 0.0,
349
  "rewards/cumulative_reward_2/mean": 0.0,
350
  "rewards/cumulative_reward_2/std": 0.0,
351
  "rewards/final_correctness_reward_func/mean": 0.0,
352
  "rewards/final_correctness_reward_func/std": 0.0,
353
- "rewards/question_recreation_reward_func/mean": 0.015356574673205614,
354
- "rewards/question_recreation_reward_func/std": 0.013046635314822197,
355
  "rewards/soft_format_reward_func/mean": 0.0,
356
  "rewards/soft_format_reward_func/std": 0.0,
357
  "rewards/strict_format_reward_func/mean": 0.0,
358
  "rewards/strict_format_reward_func/std": 0.0,
359
- "rewards/xmlcount_reward_func/mean": -0.03074999898672104,
360
- "rewards/xmlcount_reward_func/std": 0.11776353418827057,
361
  "step": 18
362
  },
363
  {
@@ -366,21 +366,21 @@
366
  "clip_ratio/low_mean": 0.0,
367
  "clip_ratio/low_min": 0.0,
368
  "clip_ratio/region_mean": 0.0,
369
- "completions/clipped_ratio": 0.125,
370
- "completions/max_length": 265.5,
371
- "completions/max_terminated_length": 56.5,
372
- "completions/mean_length": 97.625,
373
- "completions/mean_terminated_length": 41.66666793823242,
374
- "completions/min_length": 34.0,
375
- "completions/min_terminated_length": 34.0,
376
- "epoch": 9.8,
377
- "grad_norm": 14.597654342651367,
378
- "kl": 0.02688464312814176,
379
  "learning_rate": 3.4096741493194193e-09,
380
- "loss": 0.214,
381
- "num_tokens": 33969.0,
382
- "reward": -0.003884643316268921,
383
- "reward_std": 0.04420278873294592,
384
  "rewards/concensus_correctness_reward_func/mean": 0.0,
385
  "rewards/concensus_correctness_reward_func/std": 0.0,
386
  "rewards/consensus_reward_func/mean": 0.0,
@@ -389,30 +389,30 @@
389
  "rewards/cumulative_reward_2/std": 0.0,
390
  "rewards/final_correctness_reward_func/mean": 0.0,
391
  "rewards/final_correctness_reward_func/std": 0.0,
392
- "rewards/question_recreation_reward_func/mean": 0.020615354413166642,
393
- "rewards/question_recreation_reward_func/std": 0.019722969736903906,
394
  "rewards/soft_format_reward_func/mean": 0.0,
395
  "rewards/soft_format_reward_func/std": 0.0,
396
  "rewards/strict_format_reward_func/mean": 0.0,
397
  "rewards/strict_format_reward_func/std": 0.0,
398
- "rewards/xmlcount_reward_func/mean": -0.02449999935925007,
399
- "rewards/xmlcount_reward_func/std": 0.04899999871850014,
400
  "step": 20
401
  },
402
  {
403
- "epoch": 9.8,
404
  "step": 20,
405
  "total_flos": 0.0,
406
- "train_loss": 0.037779451161623,
407
- "train_runtime": 7169.9681,
408
- "train_samples_per_second": 0.011,
409
- "train_steps_per_second": 0.003
410
  }
411
  ],
412
  "logging_steps": 2,
413
  "max_steps": 20,
414
- "num_input_tokens_seen": 33969,
415
- "num_train_epochs": 10,
416
  "save_steps": 25,
417
  "stateful_callbacks": {
418
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 19.666666666666668,
6
  "eval_steps": 500,
7
  "global_step": 20,
8
  "is_hyper_param_search": false,
 
15
  "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.0,
19
+ "completions/max_length": 353.5,
20
+ "completions/max_terminated_length": 353.5,
21
+ "completions/mean_length": 172.83333587646484,
22
+ "completions/mean_terminated_length": 172.83333587646484,
23
+ "completions/min_length": 56.5,
24
+ "completions/min_terminated_length": 56.5,
25
+ "epoch": 1.6666666666666665,
26
+ "grad_norm": 24.607608795166016,
27
  "kl": 0.0,
28
  "learning_rate": 5e-07,
29
+ "loss": -0.161,
30
+ "num_tokens": 3192.0,
31
+ "reward": 0.7391411811113358,
32
+ "reward_std": 0.8254430899396539,
33
+ "rewards/concensus_correctness_reward_func/mean": 0.23999999463558197,
34
+ "rewards/concensus_correctness_reward_func/std": 0.47999998927116394,
35
+ "rewards/consensus_reward_func/mean": 0.25,
36
+ "rewards/consensus_reward_func/std": 0.5,
37
  "rewards/cumulative_reward_2/mean": 0.0,
38
  "rewards/cumulative_reward_2/std": 0.0,
39
  "rewards/final_correctness_reward_func/mean": 0.0,
40
  "rewards/final_correctness_reward_func/std": 0.0,
41
+ "rewards/question_recreation_reward_func/mean": 0.08939116820693016,
42
+ "rewards/question_recreation_reward_func/std": 0.09541699755936861,
43
  "rewards/soft_format_reward_func/mean": 0.0,
44
  "rewards/soft_format_reward_func/std": 0.0,
45
  "rewards/strict_format_reward_func/mean": 0.0,
46
  "rewards/strict_format_reward_func/std": 0.0,
47
+ "rewards/xmlcount_reward_func/mean": 0.15974999964237213,
48
+ "rewards/xmlcount_reward_func/std": 0.2241138368844986,
49
  "step": 2
50
  },
51
  {
 
54
  "clip_ratio/low_mean": 0.0,
55
  "clip_ratio/low_min": 0.0,
56
  "clip_ratio/region_mean": 0.0,
57
+ "completions/clipped_ratio": 0.16666666666666669,
58
+ "completions/max_length": 346.0,
59
+ "completions/max_terminated_length": 189.5,
60
+ "completions/mean_length": 198.8333282470703,
61
+ "completions/mean_terminated_length": 141.5,
62
+ "completions/min_length": 77.0,
63
+ "completions/min_terminated_length": 77.0,
64
+ "epoch": 3.6666666666666665,
65
+ "grad_norm": 8.281935691833496,
66
+ "kl": 0.0013531837284972426,
67
  "learning_rate": 4.864543104251586e-07,
68
+ "loss": 0.0243,
69
+ "num_tokens": 6675.0,
70
+ "reward": 0.7484749890863895,
71
+ "reward_std": 0.9315725984051824,
72
+ "rewards/concensus_correctness_reward_func/mean": 0.23999999463558197,
73
+ "rewards/concensus_correctness_reward_func/std": 0.47999998927116394,
74
+ "rewards/consensus_reward_func/mean": 0.25,
75
+ "rewards/consensus_reward_func/std": 0.5,
76
  "rewards/cumulative_reward_2/mean": 0.0,
77
  "rewards/cumulative_reward_2/std": 0.0,
78
  "rewards/final_correctness_reward_func/mean": 0.0,
79
  "rewards/final_correctness_reward_func/std": 0.0,
80
+ "rewards/question_recreation_reward_func/mean": 0.040849958546459675,
81
+ "rewards/question_recreation_reward_func/std": 0.02256392315030098,
82
  "rewards/soft_format_reward_func/mean": 0.0,
83
  "rewards/soft_format_reward_func/std": 0.0,
84
  "rewards/strict_format_reward_func/mean": 0.0,
85
  "rewards/strict_format_reward_func/std": 0.0,
86
+ "rewards/xmlcount_reward_func/mean": 0.21762500703334808,
87
+ "rewards/xmlcount_reward_func/std": 0.28903544694185257,
88
  "step": 4
89
  },
90
  {
 
93
  "clip_ratio/low_mean": 0.0,
94
  "clip_ratio/low_min": 0.0,
95
  "clip_ratio/region_mean": 0.0,
96
+ "completions/clipped_ratio": 0.0,
97
+ "completions/max_length": 230.5,
98
+ "completions/max_terminated_length": 230.5,
99
+ "completions/mean_length": 104.0,
100
+ "completions/mean_terminated_length": 104.0,
101
+ "completions/min_length": 13.5,
102
+ "completions/min_terminated_length": 13.5,
103
+ "epoch": 5.666666666666667,
104
+ "grad_norm": 19.817241668701172,
105
+ "kl": 0.005833291725139134,
106
  "learning_rate": 4.472851273490984e-07,
107
+ "loss": 0.1028,
108
+ "num_tokens": 9569.0,
109
+ "reward": 0.15289875864982605,
110
+ "reward_std": 0.13462494127452374,
111
  "rewards/concensus_correctness_reward_func/mean": 0.0,
112
  "rewards/concensus_correctness_reward_func/std": 0.0,
113
  "rewards/consensus_reward_func/mean": 0.0,
 
116
  "rewards/cumulative_reward_2/std": 0.0,
117
  "rewards/final_correctness_reward_func/mean": 0.0,
118
  "rewards/final_correctness_reward_func/std": 0.0,
119
+ "rewards/question_recreation_reward_func/mean": 0.052898744121193886,
120
+ "rewards/question_recreation_reward_func/std": 0.050960212014615536,
121
  "rewards/soft_format_reward_func/mean": 0.0,
122
  "rewards/soft_format_reward_func/std": 0.0,
123
  "rewards/strict_format_reward_func/mean": 0.0,
124
  "rewards/strict_format_reward_func/std": 0.0,
125
+ "rewards/xmlcount_reward_func/mean": 0.10000000149011612,
126
+ "rewards/xmlcount_reward_func/std": 0.16105124354362488,
127
  "step": 6
128
  },
129
  {
 
132
  "clip_ratio/low_mean": 0.0,
133
  "clip_ratio/low_min": 0.0,
134
  "clip_ratio/region_mean": 0.0,
135
+ "completions/clipped_ratio": 0.0,
136
+ "completions/max_length": 214.5,
137
+ "completions/max_terminated_length": 214.5,
138
+ "completions/mean_length": 101.66666412353516,
139
+ "completions/mean_terminated_length": 101.66666412353516,
140
+ "completions/min_length": 24.0,
141
+ "completions/min_terminated_length": 24.0,
142
+ "epoch": 7.666666666666667,
143
+ "grad_norm": 16.099365234375,
144
+ "kl": 0.006465533399023116,
145
  "learning_rate": 3.867370395306068e-07,
146
+ "loss": -0.2668,
147
+ "num_tokens": 12403.0,
148
+ "reward": 0.16425441950559616,
149
+ "reward_std": 0.13405008241534233,
150
  "rewards/concensus_correctness_reward_func/mean": 0.0,
151
  "rewards/concensus_correctness_reward_func/std": 0.0,
152
  "rewards/consensus_reward_func/mean": 0.0,
 
155
  "rewards/cumulative_reward_2/std": 0.0,
156
  "rewards/final_correctness_reward_func/mean": 0.0,
157
  "rewards/final_correctness_reward_func/std": 0.0,
158
+ "rewards/question_recreation_reward_func/mean": 0.12725441344082355,
159
+ "rewards/question_recreation_reward_func/std": 0.17341968975961208,
160
  "rewards/soft_format_reward_func/mean": 0.0,
161
  "rewards/soft_format_reward_func/std": 0.0,
162
  "rewards/strict_format_reward_func/mean": 0.0,
163
  "rewards/strict_format_reward_func/std": 0.0,
164
+ "rewards/xmlcount_reward_func/mean": 0.03700000001117587,
165
+ "rewards/xmlcount_reward_func/std": 0.047584391199052334,
166
  "step": 8
167
  },
168
  {
 
172
  "clip_ratio/low_min": 0.0,
173
  "clip_ratio/region_mean": 0.0,
174
  "completions/clipped_ratio": 0.0,
175
+ "completions/max_length": 181.0,
176
+ "completions/max_terminated_length": 181.0,
177
+ "completions/mean_length": 126.33333206176758,
178
+ "completions/mean_terminated_length": 126.33333206176758,
179
+ "completions/min_length": 77.5,
180
+ "completions/min_terminated_length": 77.5,
181
+ "epoch": 9.666666666666666,
182
+ "grad_norm": 12.087945938110352,
183
+ "kl": 0.010334744409192353,
184
  "learning_rate": 3.1137137178519977e-07,
185
+ "loss": 0.0072,
186
+ "num_tokens": 15403.0,
187
+ "reward": 0.12861808016896248,
188
+ "reward_std": 0.07254930585622787,
189
  "rewards/concensus_correctness_reward_func/mean": 0.0,
190
  "rewards/concensus_correctness_reward_func/std": 0.0,
191
  "rewards/consensus_reward_func/mean": 0.0,
192
  "rewards/consensus_reward_func/std": 0.0,
193
  "rewards/cumulative_reward_2/mean": 0.0,
194
  "rewards/cumulative_reward_2/std": 0.0,
195
+ "rewards/final_correctness_reward_func/mean": 0.0,
196
+ "rewards/final_correctness_reward_func/std": 0.0,
197
+ "rewards/question_recreation_reward_func/mean": 0.05049308016896248,
198
+ "rewards/question_recreation_reward_func/std": 0.03305052034556866,
199
  "rewards/soft_format_reward_func/mean": 0.0,
200
  "rewards/soft_format_reward_func/std": 0.0,
201
  "rewards/strict_format_reward_func/mean": 0.0,
202
  "rewards/strict_format_reward_func/std": 0.0,
203
+ "rewards/xmlcount_reward_func/mean": 0.078125,
204
+ "rewards/xmlcount_reward_func/std": 0.1196383461356163,
205
  "step": 10
206
  },
207
  {
 
210
  "clip_ratio/low_mean": 0.0,
211
  "clip_ratio/low_min": 0.0,
212
  "clip_ratio/region_mean": 0.0,
213
+ "completions/clipped_ratio": 0.0,
214
+ "completions/max_length": 208.0,
215
+ "completions/max_terminated_length": 208.0,
216
+ "completions/mean_length": 105.5,
217
+ "completions/mean_terminated_length": 105.5,
218
+ "completions/min_length": 32.0,
219
+ "completions/min_terminated_length": 32.0,
220
+ "epoch": 11.666666666666666,
221
+ "grad_norm": 25.62042808532715,
222
+ "kl": 0.014036847351235338,
223
  "learning_rate": 2.2935516363191693e-07,
224
+ "loss": -0.1005,
225
+ "num_tokens": 18146.0,
226
+ "reward": 0.07443425804376602,
227
+ "reward_std": 0.025631051044911146,
228
  "rewards/concensus_correctness_reward_func/mean": 0.0,
229
  "rewards/concensus_correctness_reward_func/std": 0.0,
230
  "rewards/consensus_reward_func/mean": 0.0,
 
233
  "rewards/cumulative_reward_2/std": 0.0,
234
  "rewards/final_correctness_reward_func/mean": 0.0,
235
  "rewards/final_correctness_reward_func/std": 0.0,
236
+ "rewards/question_recreation_reward_func/mean": 0.02743426151573658,
237
+ "rewards/question_recreation_reward_func/std": 0.011736967135220766,
238
  "rewards/soft_format_reward_func/mean": 0.0,
239
  "rewards/soft_format_reward_func/std": 0.0,
240
  "rewards/strict_format_reward_func/mean": 0.0,
241
  "rewards/strict_format_reward_func/std": 0.0,
242
+ "rewards/xmlcount_reward_func/mean": 0.04700000025331974,
243
+ "rewards/xmlcount_reward_func/std": 0.06758439540863037,
244
  "step": 12
245
  },
246
  {
 
250
  "clip_ratio/low_min": 0.0,
251
  "clip_ratio/region_mean": 0.0,
252
  "completions/clipped_ratio": 0.0,
253
+ "completions/max_length": 115.0,
254
+ "completions/max_terminated_length": 115.0,
255
+ "completions/mean_length": 93.5,
256
+ "completions/mean_terminated_length": 93.5,
257
+ "completions/min_length": 69.5,
258
+ "completions/min_terminated_length": 69.5,
259
+ "epoch": 13.666666666666666,
260
+ "grad_norm": 9.609339714050293,
261
+ "kl": 0.02532817842438817,
262
  "learning_rate": 1.4957614383675767e-07,
263
+ "loss": -0.0654,
264
+ "num_tokens": 21002.0,
265
+ "reward": 0.8926658928394318,
266
+ "reward_std": 0.7070434279739857,
267
+ "rewards/concensus_correctness_reward_func/mean": 0.24050000309944153,
268
+ "rewards/concensus_correctness_reward_func/std": 0.48100000619888306,
269
+ "rewards/consensus_reward_func/mean": 0.25,
270
+ "rewards/consensus_reward_func/std": 0.5,
271
  "rewards/cumulative_reward_2/mean": 0.0,
272
  "rewards/cumulative_reward_2/std": 0.0,
273
  "rewards/final_correctness_reward_func/mean": 0.0,
274
  "rewards/final_correctness_reward_func/std": 0.0,
275
+ "rewards/question_recreation_reward_func/mean": 0.10116582456976175,
276
+ "rewards/question_recreation_reward_func/std": 0.11067926976829767,
277
  "rewards/soft_format_reward_func/mean": 0.0,
278
  "rewards/soft_format_reward_func/std": 0.0,
279
  "rewards/strict_format_reward_func/mean": 0.0,
280
  "rewards/strict_format_reward_func/std": 0.0,
281
+ "rewards/xmlcount_reward_func/mean": 0.3009999990463257,
282
+ "rewards/xmlcount_reward_func/std": 0.2542915344238281,
283
  "step": 14
284
  },
285
  {
 
288
  "clip_ratio/low_mean": 0.0,
289
  "clip_ratio/low_min": 0.0,
290
  "clip_ratio/region_mean": 0.0,
291
+ "completions/clipped_ratio": 0.16666666666666669,
292
+ "completions/max_length": 378.0,
293
+ "completions/max_terminated_length": 206.0,
294
+ "completions/mean_length": 197.33333587646484,
295
+ "completions/mean_terminated_length": 134.91666412353516,
296
+ "completions/min_length": 78.5,
297
+ "completions/min_terminated_length": 78.5,
298
+ "epoch": 15.666666666666666,
299
+ "grad_norm": 16.777217864990234,
300
+ "kl": 0.016524533144547604,
301
  "learning_rate": 8.067960709356478e-08,
302
+ "loss": 0.0725,
303
+ "num_tokens": 24589.0,
304
+ "reward": 0.14827939122915268,
305
+ "reward_std": 0.14140347205102444,
306
  "rewards/concensus_correctness_reward_func/mean": 0.0,
307
  "rewards/concensus_correctness_reward_func/std": 0.0,
308
  "rewards/consensus_reward_func/mean": 0.0,
 
311
  "rewards/cumulative_reward_2/std": 0.0,
312
  "rewards/final_correctness_reward_func/mean": 0.0,
313
  "rewards/final_correctness_reward_func/std": 0.0,
314
+ "rewards/question_recreation_reward_func/mean": 0.13127939216792583,
315
+ "rewards/question_recreation_reward_func/std": 0.19441921077668667,
316
  "rewards/soft_format_reward_func/mean": 0.0,
317
  "rewards/soft_format_reward_func/std": 0.0,
318
  "rewards/strict_format_reward_func/mean": 0.0,
319
  "rewards/strict_format_reward_func/std": 0.0,
320
+ "rewards/xmlcount_reward_func/mean": 0.017000000923871994,
321
+ "rewards/xmlcount_reward_func/std": 0.08594898879528046,
322
  "step": 16
323
  },
324
  {
 
327
  "clip_ratio/low_mean": 0.0,
328
  "clip_ratio/low_min": 0.0,
329
  "clip_ratio/region_mean": 0.0,
330
+ "completions/clipped_ratio": 0.0,
331
+ "completions/max_length": 245.0,
332
+ "completions/max_terminated_length": 245.0,
333
+ "completions/mean_length": 169.33333587646484,
334
+ "completions/mean_terminated_length": 169.33333587646484,
335
+ "completions/min_length": 98.5,
336
+ "completions/min_terminated_length": 98.5,
337
+ "epoch": 17.666666666666668,
338
+ "grad_norm": 12.084244728088379,
339
+ "kl": 0.013836602825904265,
340
  "learning_rate": 3.013156219837776e-08,
341
+ "loss": 0.1236,
342
+ "num_tokens": 27785.0,
343
+ "reward": 0.7921578735113144,
344
+ "reward_std": 1.0020692646503448,
345
+ "rewards/concensus_correctness_reward_func/mean": 0.24050000309944153,
346
+ "rewards/concensus_correctness_reward_func/std": 0.48100000619888306,
347
+ "rewards/consensus_reward_func/mean": 0.25,
348
+ "rewards/consensus_reward_func/std": 0.5,
349
  "rewards/cumulative_reward_2/mean": 0.0,
350
  "rewards/cumulative_reward_2/std": 0.0,
351
  "rewards/final_correctness_reward_func/mean": 0.0,
352
  "rewards/final_correctness_reward_func/std": 0.0,
353
+ "rewards/question_recreation_reward_func/mean": 0.12290793936699629,
354
+ "rewards/question_recreation_reward_func/std": 0.19958087475970387,
355
  "rewards/soft_format_reward_func/mean": 0.0,
356
  "rewards/soft_format_reward_func/std": 0.0,
357
  "rewards/strict_format_reward_func/mean": 0.0,
358
  "rewards/strict_format_reward_func/std": 0.0,
359
+ "rewards/xmlcount_reward_func/mean": 0.17875000089406967,
360
+ "rewards/xmlcount_reward_func/std": 0.28059088438749313,
361
  "step": 18
362
  },
363
  {
 
366
  "clip_ratio/low_mean": 0.0,
367
  "clip_ratio/low_min": 0.0,
368
  "clip_ratio/region_mean": 0.0,
369
+ "completions/clipped_ratio": 0.0,
370
+ "completions/max_length": 341.0,
371
+ "completions/max_terminated_length": 341.0,
372
+ "completions/mean_length": 187.5,
373
+ "completions/mean_terminated_length": 187.5,
374
+ "completions/min_length": 96.0,
375
+ "completions/min_terminated_length": 96.0,
376
+ "epoch": 19.666666666666668,
377
+ "grad_norm": 8.484414100646973,
378
+ "kl": 0.008221860975027084,
379
  "learning_rate": 3.4096741493194193e-09,
380
+ "loss": -0.1244,
381
+ "num_tokens": 31386.0,
382
+ "reward": 0.07486644759774208,
383
+ "reward_std": 0.07879441790282726,
384
  "rewards/concensus_correctness_reward_func/mean": 0.0,
385
  "rewards/concensus_correctness_reward_func/std": 0.0,
386
  "rewards/consensus_reward_func/mean": 0.0,
 
389
  "rewards/cumulative_reward_2/std": 0.0,
390
  "rewards/final_correctness_reward_func/mean": 0.0,
391
  "rewards/final_correctness_reward_func/std": 0.0,
392
+ "rewards/question_recreation_reward_func/mean": 0.0378664480522275,
393
+ "rewards/question_recreation_reward_func/std": 0.04724898235872388,
394
  "rewards/soft_format_reward_func/mean": 0.0,
395
  "rewards/soft_format_reward_func/std": 0.0,
396
  "rewards/strict_format_reward_func/mean": 0.0,
397
  "rewards/strict_format_reward_func/std": 0.0,
398
+ "rewards/xmlcount_reward_func/mean": 0.03700000047683716,
399
+ "rewards/xmlcount_reward_func/std": 0.07400000095367432,
400
  "step": 20
401
  },
402
  {
403
+ "epoch": 19.666666666666668,
404
  "step": 20,
405
  "total_flos": 0.0,
406
+ "train_loss": -0.03877150900661945,
407
+ "train_runtime": 4996.2526,
408
+ "train_samples_per_second": 0.016,
409
+ "train_steps_per_second": 0.004
410
  }
411
  ],
412
  "logging_steps": 2,
413
  "max_steps": 20,
414
+ "num_input_tokens_seen": 31386,
415
+ "num_train_epochs": 20,
416
  "save_steps": 25,
417
  "stateful_callbacks": {
418
  "TrainerControl": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce333ccfd0eebed288b7f3195f0087f5d0abcd5c388d8f41ec2f2268933c2e63
3
  size 6929
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff8519a297979b81c6f6328bec1149b7a9ae9dac22c186355657d01cde9d0fa5
3
  size 6929