MictoNode commited on
Commit
f7d46c2
·
verified ·
1 Parent(s): cc1960f

End of training

Browse files
README.md CHANGED
@@ -8,6 +8,7 @@ tags:
8
  - grpo
9
  - gensyn
10
  - I am bipedal exotic pelican
 
11
  - trl
12
  licence: license
13
  ---
@@ -39,7 +40,7 @@ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing
39
 
40
  - TRL: 0.15.2
41
  - Transformers: 4.51.3
42
- - Pytorch: 2.5.1
43
  - Datasets: 3.5.1
44
  - Tokenizers: 0.21.1
45
 
 
8
  - grpo
9
  - gensyn
10
  - I am bipedal exotic pelican
11
+ - unsloth
12
  - trl
13
  licence: license
14
  ---
 
40
 
41
  - TRL: 0.15.2
42
  - Transformers: 4.51.3
43
+ - Pytorch: 2.6.0
44
  - Datasets: 3.5.1
45
  - Tokenizers: 0.21.1
46
 
adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Gensyn/Qwen2.5-0.5B-Instruct",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 16,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "up_proj",
28
+ "o_proj",
29
+ "k_proj",
30
+ "v_proj",
31
+ "down_proj",
32
+ "gate_proj",
33
+ "q_proj"
34
+ ],
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8995ee0ce70d515dc683e56f972f42413eb9b3d3b356e6821e9f2e385d8c4d8f
3
+ size 35237104
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 29684.7394626798,
4
- "train_runtime": 550.6438,
5
- "train_samples": 140,
6
- "train_samples_per_second": 2.906,
7
- "train_steps_per_second": 0.182
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 2.47737436893658e-06,
4
+ "train_runtime": 471.1429,
5
+ "train_samples": 19,
6
+ "train_samples_per_second": 0.679,
7
+ "train_steps_per_second": 0.042
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 29684.7394626798,
4
- "train_runtime": 550.6438,
5
- "train_samples": 140,
6
- "train_samples_per_second": 2.906,
7
- "train_steps_per_second": 0.182
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 2.47737436893658e-06,
4
+ "train_runtime": 471.1429,
5
+ "train_samples": 19,
6
+ "train_samples_per_second": 0.679,
7
+ "train_steps_per_second": 0.042
8
  }
trainer_state.json CHANGED
@@ -2,977 +2,217 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 5.571428571428571,
6
  "eval_steps": 500,
7
- "global_step": 100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "completion_length": 145.71875,
14
- "epoch": 0.11428571428571428,
15
- "grad_norm": 0.0,
16
- "kl": 0.0,
17
- "learning_rate": 1.6666666666666665e-07,
18
- "loss": -0.0,
19
- "reward": 4.5110213661100715,
20
- "reward_std": 1.5103917999658734,
21
- "rewards/concensus_correctness_reward_func": 1.70499999076128,
22
- "rewards/consensus_reward_func": 1.125,
23
  "rewards/cumulative_reward_2": 0.0,
24
  "rewards/final_correctness_reward_func": 0.0,
25
- "rewards/question_recreation_reward_func": 0.673615159932524,
26
  "rewards/soft_format_reward_func": 0.0,
27
- "rewards/strict_format_reward_func": 0.25,
28
- "rewards/xmlcount_reward_func": 0.7574062517960556,
29
  "step": 2
30
  },
31
  {
32
- "completion_length": 105.78125,
33
- "epoch": 0.22857142857142856,
34
- "grad_norm": 134.23297119140625,
35
- "kl": 0.0054768975596743985,
36
- "learning_rate": 5e-07,
37
  "loss": 0.0,
38
- "reward": 6.952304229140282,
39
- "reward_std": 0.29407966136932373,
40
- "rewards/concensus_correctness_reward_func": 2.0063750073313713,
41
- "rewards/consensus_reward_func": 1.9375,
42
  "rewards/cumulative_reward_2": 0.0,
43
- "rewards/final_correctness_reward_func": 0.375,
44
- "rewards/question_recreation_reward_func": 0.9693667776882648,
45
  "rewards/soft_format_reward_func": 0.0,
46
- "rewards/strict_format_reward_func": 0.453125,
47
- "rewards/xmlcount_reward_func": 1.2109375,
48
  "step": 4
49
  },
50
  {
51
- "completion_length": 123.59375,
52
- "epoch": 0.34285714285714286,
53
- "grad_norm": 0.009817171841859818,
54
- "kl": 0.0034752699643831875,
55
- "learning_rate": 4.994757065594279e-07,
56
  "loss": 0.0,
57
- "reward": 6.67712490260601,
58
- "reward_std": 0.0,
59
- "rewards/concensus_correctness_reward_func": 1.8021250013262033,
60
- "rewards/consensus_reward_func": 1.875,
61
  "rewards/cumulative_reward_2": 0.0,
62
- "rewards/final_correctness_reward_func": 0.25,
63
- "rewards/question_recreation_reward_func": 1.0,
64
  "rewards/soft_format_reward_func": 0.0,
65
- "rewards/strict_format_reward_func": 0.5,
66
- "rewards/xmlcount_reward_func": 1.25,
67
  "step": 6
68
  },
69
  {
70
- "completion_length": 114.40625,
71
- "epoch": 0.45714285714285713,
72
- "grad_norm": 0.01174523402005434,
73
- "kl": 0.0023470728901884286,
74
- "learning_rate": 4.979050253066063e-07,
75
  "loss": 0.0,
76
- "reward": 7.667749911546707,
77
- "reward_std": 0.0,
78
- "rewards/concensus_correctness_reward_func": 2.4177500009536743,
79
- "rewards/consensus_reward_func": 2.0,
80
  "rewards/cumulative_reward_2": 0.0,
81
- "rewards/final_correctness_reward_func": 0.5,
82
- "rewards/question_recreation_reward_func": 1.0,
83
  "rewards/soft_format_reward_func": 0.0,
84
- "rewards/strict_format_reward_func": 0.5,
85
- "rewards/xmlcount_reward_func": 1.25,
86
  "step": 8
87
  },
88
  {
89
- "completion_length": 115.40625,
90
- "epoch": 0.5714285714285714,
91
- "grad_norm": 0.005173771176487207,
92
- "kl": 0.014806151867560402,
93
- "learning_rate": 4.952945442245597e-07,
94
  "loss": 0.0,
95
- "reward": 7.168810069561005,
96
- "reward_std": 0.0,
97
- "rewards/concensus_correctness_reward_func": 2.1689999848604202,
98
- "rewards/consensus_reward_func": 2.0,
99
  "rewards/cumulative_reward_2": 0.0,
100
- "rewards/final_correctness_reward_func": 0.25,
101
- "rewards/question_recreation_reward_func": 0.9998100288212299,
102
  "rewards/soft_format_reward_func": 0.0,
103
- "rewards/strict_format_reward_func": 0.5,
104
- "rewards/xmlcount_reward_func": 1.25,
105
  "step": 10
106
  },
107
  {
108
- "completion_length": 113.9375,
109
- "epoch": 0.6857142857142857,
110
- "grad_norm": 137.42959594726562,
111
- "kl": 6.994681327629223,
112
- "learning_rate": 4.916552125781528e-07,
113
- "loss": 0.007,
114
- "reward": 7.247430235147476,
115
- "reward_std": 0.0,
116
- "rewards/concensus_correctness_reward_func": 2.2944999784231186,
117
- "rewards/consensus_reward_func": 1.875,
118
- "rewards/cumulative_reward_2": 0.0,
119
- "rewards/final_correctness_reward_func": 0.375,
120
- "rewards/question_recreation_reward_func": 0.9998052977025509,
121
- "rewards/soft_format_reward_func": 0.0,
122
- "rewards/strict_format_reward_func": 0.46875,
123
- "rewards/xmlcount_reward_func": 1.234375,
124
- "step": 12
125
- },
126
- {
127
- "completion_length": 122.0,
128
- "epoch": 0.8,
129
- "grad_norm": 1.0056698322296143,
130
- "kl": 0.013116561494825874,
131
- "learning_rate": 4.870022949890676e-07,
132
  "loss": 0.0,
133
- "reward": 6.890243321657181,
134
- "reward_std": 0.03314562886953354,
135
- "rewards/concensus_correctness_reward_func": 2.043999969959259,
136
- "rewards/consensus_reward_func": 2.0,
137
- "rewards/cumulative_reward_2": 0.0,
138
- "rewards/final_correctness_reward_func": 0.125,
139
- "rewards/question_recreation_reward_func": 0.9946808516979218,
140
- "rewards/soft_format_reward_func": 0.0,
141
- "rewards/strict_format_reward_func": 0.484375,
142
- "rewards/xmlcount_reward_func": 1.2421875,
143
- "step": 14
144
- },
145
- {
146
- "completion_length": 117.96875,
147
- "epoch": 0.9142857142857143,
148
- "grad_norm": 55.910499572753906,
149
- "kl": 0.029010365306021413,
150
- "learning_rate": 4.81355307410676e-07,
151
- "loss": 0.0,
152
- "reward": 6.665663808584213,
153
- "reward_std": 0.08780299872159958,
154
- "rewards/concensus_correctness_reward_func": 1.977749988436699,
155
- "rewards/consensus_reward_func": 1.875,
156
- "rewards/cumulative_reward_2": 0.0,
157
- "rewards/final_correctness_reward_func": 0.125,
158
- "rewards/question_recreation_reward_func": 0.9691639058291912,
159
- "rewards/soft_format_reward_func": 0.0,
160
- "rewards/strict_format_reward_func": 0.484375,
161
- "rewards/xmlcount_reward_func": 1.234375,
162
- "step": 16
163
- },
164
- {
165
- "completion_length": 116.91666666666667,
166
- "epoch": 1.0,
167
- "grad_norm": 0.018898706883192062,
168
- "kl": 0.20654772468454516,
169
- "learning_rate": 4.747379352713488e-07,
170
- "loss": 0.0002,
171
- "reward": 6.508333325386047,
172
- "reward_std": 0.0,
173
- "rewards/concensus_correctness_reward_func": 1.7583333055178325,
174
- "rewards/consensus_reward_func": 2.0,
175
  "rewards/cumulative_reward_2": 0.0,
176
  "rewards/final_correctness_reward_func": 0.0,
177
- "rewards/question_recreation_reward_func": 1.0,
178
- "rewards/soft_format_reward_func": 0.0,
179
- "rewards/strict_format_reward_func": 0.5,
180
- "rewards/xmlcount_reward_func": 1.25,
181
- "step": 18
182
- },
183
- {
184
- "completion_length": 119.65625,
185
- "epoch": 1.1142857142857143,
186
- "grad_norm": 131673767936.0,
187
- "kl": 1484174298.078927,
188
- "learning_rate": 4.6717793412953776e-07,
189
- "loss": 1484174.375,
190
- "reward": 6.440662741661072,
191
- "reward_std": 0.3215883672237396,
192
- "rewards/concensus_correctness_reward_func": 1.8583749793469906,
193
- "rewards/consensus_reward_func": 1.9375,
194
- "rewards/cumulative_reward_2": 0.0,
195
- "rewards/final_correctness_reward_func": 0.0,
196
- "rewards/question_recreation_reward_func": 0.9690064564347267,
197
- "rewards/soft_format_reward_func": 0.0,
198
- "rewards/strict_format_reward_func": 0.46875,
199
- "rewards/xmlcount_reward_func": 1.20703125,
200
- "step": 20
201
- },
202
- {
203
- "completion_length": 113.53125,
204
- "epoch": 1.2285714285714286,
205
- "grad_norm": 0.2772182524204254,
206
- "kl": 0.598538670794369,
207
- "learning_rate": 4.5870701325731773e-07,
208
- "loss": 0.0006,
209
- "reward": 7.0761168003082275,
210
- "reward_std": 0.06973974592983723,
211
- "rewards/concensus_correctness_reward_func": 2.1724999845027924,
212
- "rewards/consensus_reward_func": 1.875,
213
- "rewards/cumulative_reward_2": 0.0,
214
- "rewards/final_correctness_reward_func": 0.375,
215
- "rewards/question_recreation_reward_func": 0.9934605993330479,
216
- "rewards/soft_format_reward_func": 0.0,
217
- "rewards/strict_format_reward_func": 0.453125,
218
- "rewards/xmlcount_reward_func": 1.20703125,
219
- "step": 22
220
- },
221
- {
222
- "completion_length": 111.625,
223
- "epoch": 1.342857142857143,
224
- "grad_norm": 0.3359204828739166,
225
- "kl": 0.025825566772255115,
226
- "learning_rate": 4.4936070264068016e-07,
227
- "loss": 0.0,
228
- "reward": 7.169749945402145,
229
- "reward_std": 0.0,
230
- "rewards/concensus_correctness_reward_func": 2.169749990105629,
231
- "rewards/consensus_reward_func": 2.0,
232
- "rewards/cumulative_reward_2": 0.0,
233
- "rewards/final_correctness_reward_func": 0.25,
234
- "rewards/question_recreation_reward_func": 1.0,
235
- "rewards/soft_format_reward_func": 0.0,
236
- "rewards/strict_format_reward_func": 0.5,
237
- "rewards/xmlcount_reward_func": 1.25,
238
- "step": 24
239
- },
240
- {
241
- "completion_length": 113.6875,
242
- "epoch": 1.457142857142857,
243
- "grad_norm": 0.005876157432794571,
244
- "kl": 0.0050323337000008905,
245
- "learning_rate": 4.391782039544238e-07,
246
- "loss": 0.0,
247
- "reward": 7.012999922037125,
248
- "reward_std": 0.0,
249
- "rewards/concensus_correctness_reward_func": 2.0129999965429306,
250
- "rewards/consensus_reward_func": 2.0,
251
- "rewards/cumulative_reward_2": 0.0,
252
- "rewards/final_correctness_reward_func": 0.25,
253
- "rewards/question_recreation_reward_func": 1.0,
254
- "rewards/soft_format_reward_func": 0.0,
255
- "rewards/strict_format_reward_func": 0.5,
256
- "rewards/xmlcount_reward_func": 1.25,
257
- "step": 26
258
- },
259
- {
260
- "completion_length": 121.6875,
261
- "epoch": 1.5714285714285714,
262
- "grad_norm": 0.3083041310310364,
263
- "kl": 0.2358870167809073,
264
- "learning_rate": 4.282022261367073e-07,
265
- "loss": 0.0002,
266
- "reward": 6.6188749223947525,
267
- "reward_std": 0.11048543266952038,
268
- "rewards/concensus_correctness_reward_func": 1.946999991312623,
269
- "rewards/consensus_reward_func": 1.875,
270
- "rewards/cumulative_reward_2": 0.0,
271
- "rewards/final_correctness_reward_func": 0.0625,
272
- "rewards/question_recreation_reward_func": 1.0,
273
- "rewards/soft_format_reward_func": 0.0,
274
- "rewards/strict_format_reward_func": 0.484375,
275
- "rewards/xmlcount_reward_func": 1.25,
276
- "step": 28
277
- },
278
- {
279
- "completion_length": 117.34375,
280
- "epoch": 1.6857142857142857,
281
- "grad_norm": 0.00944291427731514,
282
- "kl": 0.005240324962869636,
283
- "learning_rate": 4.1647880625292027e-07,
284
- "loss": 0.0,
285
- "reward": 6.917999982833862,
286
- "reward_std": 0.0,
287
- "rewards/concensus_correctness_reward_func": 2.0429999604821205,
288
- "rewards/consensus_reward_func": 2.0,
289
- "rewards/cumulative_reward_2": 0.0,
290
- "rewards/final_correctness_reward_func": 0.125,
291
- "rewards/question_recreation_reward_func": 1.0,
292
- "rewards/soft_format_reward_func": 0.0,
293
- "rewards/strict_format_reward_func": 0.5,
294
- "rewards/xmlcount_reward_func": 1.25,
295
- "step": 30
296
- },
297
- {
298
- "completion_length": 118.59375,
299
- "epoch": 1.8,
300
- "grad_norm": 1.7269436120986938,
301
- "kl": 0.01583864638087107,
302
- "learning_rate": 4.040571164002318e-07,
303
- "loss": 0.0,
304
- "reward": 7.481499880552292,
305
- "reward_std": 0.0883883461356163,
306
- "rewards/concensus_correctness_reward_func": 2.419000007212162,
307
- "rewards/consensus_reward_func": 1.875,
308
- "rewards/cumulative_reward_2": 0.0,
309
- "rewards/final_correctness_reward_func": 0.4375,
310
- "rewards/question_recreation_reward_func": 1.0,
311
- "rewards/soft_format_reward_func": 0.0,
312
- "rewards/strict_format_reward_func": 0.5,
313
- "rewards/xmlcount_reward_func": 1.25,
314
- "step": 32
315
- },
316
- {
317
- "completion_length": 122.4375,
318
- "epoch": 1.9142857142857141,
319
- "grad_norm": 0.019974946975708008,
320
- "kl": 0.048992674514011014,
321
- "learning_rate": 3.909892574627266e-07,
322
- "loss": 0.0,
323
- "reward": 6.924062520265579,
324
- "reward_std": 0.12153397500514984,
325
- "rewards/concensus_correctness_reward_func": 1.9162499904632568,
326
- "rewards/consensus_reward_func": 2.0,
327
- "rewards/cumulative_reward_2": 0.0,
328
- "rewards/final_correctness_reward_func": 0.3125,
329
- "rewards/question_recreation_reward_func": 1.0,
330
- "rewards/soft_format_reward_func": 0.0,
331
- "rewards/strict_format_reward_func": 0.453125,
332
- "rewards/xmlcount_reward_func": 1.2421875,
333
- "step": 34
334
- },
335
- {
336
- "completion_length": 121.91666666666667,
337
- "epoch": 2.0,
338
- "grad_norm": 0.011273320764303207,
339
- "kl": 0.027701640533147536,
340
- "learning_rate": 3.773300405821908e-07,
341
- "loss": 0.0,
342
- "reward": 6.87549106280009,
343
- "reward_std": 0.033116184175014496,
344
- "rewards/concensus_correctness_reward_func": 1.9893333216508229,
345
- "rewards/consensus_reward_func": 2.0,
346
- "rewards/cumulative_reward_2": 0.0,
347
- "rewards/final_correctness_reward_func": 0.16666666666666666,
348
- "rewards/question_recreation_reward_func": 0.9929078022638956,
349
- "rewards/soft_format_reward_func": 0.0,
350
- "rewards/strict_format_reward_func": 0.4791666666666667,
351
- "rewards/xmlcount_reward_func": 1.2474166651566823,
352
- "step": 36
353
- },
354
- {
355
- "completion_length": 113.9375,
356
- "epoch": 2.1142857142857143,
357
- "grad_norm": 0.11432693153619766,
358
- "kl": 0.04086480487239896,
359
- "learning_rate": 3.6313675726113475e-07,
360
- "loss": 0.0,
361
- "reward": 7.168903559446335,
362
- "reward_std": 4.442277713678777e-05,
363
- "rewards/concensus_correctness_reward_func": 2.169124983251095,
364
- "rewards/consensus_reward_func": 2.0,
365
- "rewards/cumulative_reward_2": 0.0,
366
- "rewards/final_correctness_reward_func": 0.25,
367
- "rewards/question_recreation_reward_func": 0.9997786208987236,
368
  "rewards/soft_format_reward_func": 0.0,
369
- "rewards/strict_format_reward_func": 0.5,
370
- "rewards/xmlcount_reward_func": 1.25,
371
- "step": 38
372
  },
373
  {
374
- "completion_length": 116.71875,
375
- "epoch": 2.2285714285714286,
376
- "grad_norm": 0.09188953787088394,
377
- "kl": 0.012245212276866368,
378
- "learning_rate": 3.484689390623218e-07,
379
  "loss": 0.0,
380
- "reward": 7.169124960899353,
381
- "reward_std": 0.0,
382
- "rewards/concensus_correctness_reward_func": 2.169124983251095,
383
- "rewards/consensus_reward_func": 2.0,
384
- "rewards/cumulative_reward_2": 0.0,
385
- "rewards/final_correctness_reward_func": 0.25,
386
- "rewards/question_recreation_reward_func": 1.0,
387
- "rewards/soft_format_reward_func": 0.0,
388
- "rewards/strict_format_reward_func": 0.5,
389
- "rewards/xmlcount_reward_func": 1.25,
390
- "step": 40
391
- },
392
- {
393
- "completion_length": 135.125,
394
- "epoch": 2.342857142857143,
395
- "grad_norm": 3.4787302017211914,
396
- "kl": 0.593309123571089,
397
- "learning_rate": 3.3338810791270517e-07,
398
- "loss": 0.0006,
399
- "reward": 7.257124900817871,
400
- "reward_std": 0.0,
401
- "rewards/concensus_correctness_reward_func": 2.1321250051259995,
402
- "rewards/consensus_reward_func": 2.0,
403
- "rewards/cumulative_reward_2": 0.0,
404
- "rewards/final_correctness_reward_func": 0.375,
405
- "rewards/question_recreation_reward_func": 1.0,
406
- "rewards/soft_format_reward_func": 0.0,
407
- "rewards/strict_format_reward_func": 0.5,
408
- "rewards/xmlcount_reward_func": 1.25,
409
- "step": 42
410
- },
411
- {
412
- "completion_length": 121.03125,
413
- "epoch": 2.4571428571428573,
414
- "grad_norm": 1.0152610540390015,
415
- "kl": 0.24847944835892122,
416
- "learning_rate": 3.179575180590857e-07,
417
- "loss": 0.0002,
418
- "reward": 7.024739623069763,
419
- "reward_std": 0.011151571234222502,
420
- "rewards/concensus_correctness_reward_func": 2.0638749971985817,
421
- "rewards/consensus_reward_func": 2.0,
422
- "rewards/cumulative_reward_2": 0.0,
423
- "rewards/final_correctness_reward_func": 0.25,
424
- "rewards/question_recreation_reward_func": 0.9999271556735039,
425
- "rewards/soft_format_reward_func": 0.0,
426
- "rewards/strict_format_reward_func": 0.46875,
427
- "rewards/xmlcount_reward_func": 1.2421875,
428
- "step": 44
429
- },
430
- {
431
- "completion_length": 124.9375,
432
- "epoch": 2.571428571428571,
433
- "grad_norm": 7.650447845458984,
434
- "kl": 0.8674181794049218,
435
- "learning_rate": 3.022418907578188e-07,
436
- "loss": 0.0009,
437
- "reward": 6.307055249810219,
438
- "reward_std": 0.04419417306780815,
439
- "rewards/concensus_correctness_reward_func": 1.7134999874979258,
440
- "rewards/consensus_reward_func": 1.875,
441
  "rewards/cumulative_reward_2": 0.0,
442
  "rewards/final_correctness_reward_func": 0.0,
443
- "rewards/question_recreation_reward_func": 0.9998052977025509,
444
- "rewards/soft_format_reward_func": 0.0,
445
- "rewards/strict_format_reward_func": 0.46875,
446
- "rewards/xmlcount_reward_func": 1.25,
447
- "step": 46
448
- },
449
- {
450
- "completion_length": 120.9375,
451
- "epoch": 2.685714285714286,
452
- "grad_norm": 1.830073356628418,
453
- "kl": 0.1826353092528734,
454
- "learning_rate": 2.863071428113726e-07,
455
- "loss": 0.0002,
456
- "reward": 6.91051921248436,
457
- "reward_std": 0.011993602442089468,
458
- "rewards/concensus_correctness_reward_func": 2.0439999848604202,
459
- "rewards/consensus_reward_func": 2.0,
460
- "rewards/cumulative_reward_2": 0.0,
461
- "rewards/final_correctness_reward_func": 0.125,
462
- "rewards/question_recreation_reward_func": 0.9915192350745201,
463
- "rewards/soft_format_reward_func": 0.0,
464
- "rewards/strict_format_reward_func": 0.5,
465
- "rewards/xmlcount_reward_func": 1.25,
466
- "step": 48
467
- },
468
- {
469
- "completion_length": 112.4375,
470
- "epoch": 2.8,
471
- "grad_norm": 772774.3125,
472
- "kl": 61294.761235032754,
473
- "learning_rate": 2.7022011009035107e-07,
474
- "loss": 61.2948,
475
- "reward": 6.738324627280235,
476
- "reward_std": 0.09737744927406311,
477
- "rewards/concensus_correctness_reward_func": 2.064999971538782,
478
- "rewards/consensus_reward_func": 1.75,
479
- "rewards/cumulative_reward_2": 0.0,
480
- "rewards/final_correctness_reward_func": 0.25,
481
- "rewards/question_recreation_reward_func": 0.9897308498620987,
482
- "rewards/soft_format_reward_func": 0.0,
483
- "rewards/strict_format_reward_func": 0.46875,
484
- "rewards/xmlcount_reward_func": 1.21484375,
485
- "step": 50
486
- },
487
- {
488
- "completion_length": 113.0,
489
- "epoch": 2.914285714285714,
490
- "grad_norm": 0.027667926624417305,
491
- "kl": 0.02456555592652876,
492
- "learning_rate": 2.540482672006254e-07,
493
- "loss": 0.0,
494
- "reward": 7.299999892711639,
495
- "reward_std": 0.0,
496
- "rewards/concensus_correctness_reward_func": 2.1749999970197678,
497
- "rewards/consensus_reward_func": 2.0,
498
- "rewards/cumulative_reward_2": 0.0,
499
- "rewards/final_correctness_reward_func": 0.375,
500
- "rewards/question_recreation_reward_func": 1.0,
501
- "rewards/soft_format_reward_func": 0.0,
502
- "rewards/strict_format_reward_func": 0.5,
503
- "rewards/xmlcount_reward_func": 1.25,
504
- "step": 52
505
- },
506
- {
507
- "completion_length": 108.95833333333333,
508
- "epoch": 3.0,
509
- "grad_norm": 0.08859401941299438,
510
- "kl": 0.022353365551680326,
511
- "learning_rate": 2.37859444471388e-07,
512
- "loss": 0.0,
513
- "reward": 6.915333350499471,
514
- "reward_std": 0.0,
515
- "rewards/concensus_correctness_reward_func": 1.9986666440963745,
516
- "rewards/consensus_reward_func": 2.0,
517
- "rewards/cumulative_reward_2": 0.0,
518
- "rewards/final_correctness_reward_func": 0.16666666666666666,
519
- "rewards/question_recreation_reward_func": 1.0,
520
  "rewards/soft_format_reward_func": 0.0,
521
- "rewards/strict_format_reward_func": 0.5,
522
- "rewards/xmlcount_reward_func": 1.25,
523
- "step": 54
524
- },
525
- {
526
- "completion_length": 113.40625,
527
- "epoch": 3.1142857142857143,
528
- "grad_norm": 0.04827781766653061,
529
- "kl": 0.1669018538814271,
530
- "learning_rate": 2.2172154345117894e-07,
531
- "loss": 0.0002,
532
- "reward": 6.828624963760376,
533
- "reward_std": 0.0,
534
- "rewards/concensus_correctness_reward_func": 1.953624963760376,
535
- "rewards/consensus_reward_func": 2.0,
536
- "rewards/cumulative_reward_2": 0.0,
537
- "rewards/final_correctness_reward_func": 0.125,
538
- "rewards/question_recreation_reward_func": 1.0,
539
- "rewards/soft_format_reward_func": 0.0,
540
- "rewards/strict_format_reward_func": 0.5,
541
- "rewards/xmlcount_reward_func": 1.25,
542
- "step": 56
543
- },
544
- {
545
- "completion_length": 111.3125,
546
- "epoch": 3.2285714285714286,
547
- "grad_norm": 0.08809684216976166,
548
- "kl": 0.09562112473940942,
549
- "learning_rate": 2.0570225210519433e-07,
550
- "loss": 0.0001,
551
- "reward": 7.075749933719635,
552
- "reward_std": 0.02863781340420246,
553
- "rewards/concensus_correctness_reward_func": 2.0960000082850456,
554
- "rewards/consensus_reward_func": 2.0,
555
- "rewards/cumulative_reward_2": 0.0,
556
- "rewards/final_correctness_reward_func": 0.25,
557
- "rewards/question_recreation_reward_func": 1.0,
558
- "rewards/soft_format_reward_func": 0.0,
559
- "rewards/strict_format_reward_func": 0.484375,
560
- "rewards/xmlcount_reward_func": 1.2453749999403954,
561
- "step": 58
562
- },
563
- {
564
- "completion_length": 116.65625,
565
- "epoch": 3.342857142857143,
566
- "grad_norm": 5.16734504699707,
567
- "kl": 9.673246745584038,
568
- "learning_rate": 1.8986876090843664e-07,
569
- "loss": 0.0097,
570
- "reward": 6.642811328172684,
571
- "reward_std": 0.14822890423238277,
572
- "rewards/concensus_correctness_reward_func": 2.0444999784231186,
573
- "rewards/consensus_reward_func": 1.75,
574
- "rewards/cumulative_reward_2": 0.0,
575
- "rewards/final_correctness_reward_func": 0.1875,
576
- "rewards/question_recreation_reward_func": 0.9889363348484039,
577
- "rewards/soft_format_reward_func": 0.0,
578
- "rewards/strict_format_reward_func": 0.453125,
579
- "rewards/xmlcount_reward_func": 1.21875,
580
- "step": 60
581
- },
582
- {
583
- "completion_length": 114.09375,
584
- "epoch": 3.4571428571428573,
585
- "grad_norm": 8.348511695861816,
586
- "kl": 1.3997792335576378,
587
- "learning_rate": 1.7428748102551234e-07,
588
- "loss": 0.0014,
589
- "reward": 6.94431246817112,
590
- "reward_std": 0.19701763801276684,
591
- "rewards/concensus_correctness_reward_func": 2.045875007286668,
592
- "rewards/consensus_reward_func": 1.8125,
593
- "rewards/cumulative_reward_2": 0.0,
594
- "rewards/final_correctness_reward_func": 0.375,
595
- "rewards/question_recreation_reward_func": 1.0,
596
- "rewards/soft_format_reward_func": 0.0,
597
- "rewards/strict_format_reward_func": 0.46875,
598
- "rewards/xmlcount_reward_func": 1.2421875,
599
- "step": 62
600
- },
601
- {
602
- "completion_length": 118.75,
603
- "epoch": 3.571428571428571,
604
- "grad_norm": 28.351816177368164,
605
- "kl": 1.849376610138279,
606
- "learning_rate": 1.5902376575912814e-07,
607
- "loss": 0.0018,
608
- "reward": 7.588562428951263,
609
- "reward_std": 0.09943689405918121,
610
- "rewards/concensus_correctness_reward_func": 2.315124996006489,
611
- "rewards/consensus_reward_func": 2.0,
612
- "rewards/cumulative_reward_2": 0.0,
613
- "rewards/final_correctness_reward_func": 0.5625,
614
- "rewards/question_recreation_reward_func": 1.0,
615
- "rewards/soft_format_reward_func": 0.0,
616
- "rewards/strict_format_reward_func": 0.46875,
617
- "rewards/xmlcount_reward_func": 1.2421875,
618
- "step": 64
619
- },
620
- {
621
- "completion_length": 125.0,
622
- "epoch": 3.685714285714286,
623
- "grad_norm": 0.1395985186100006,
624
- "kl": 0.08357869584142463,
625
- "learning_rate": 1.4414163643562753e-07,
626
- "loss": 0.0001,
627
- "reward": 7.033588498830795,
628
- "reward_std": 0.04468413017457351,
629
- "rewards/concensus_correctness_reward_func": 2.0653749853372574,
630
- "rewards/consensus_reward_func": 2.0,
631
- "rewards/cumulative_reward_2": 0.0,
632
- "rewards/final_correctness_reward_func": 0.25,
633
- "rewards/question_recreation_reward_func": 0.999463576823473,
634
- "rewards/soft_format_reward_func": 0.0,
635
- "rewards/strict_format_reward_func": 0.484375,
636
- "rewards/xmlcount_reward_func": 1.234375,
637
- "step": 66
638
  },
639
  {
640
- "completion_length": 118.5625,
641
- "epoch": 3.8,
642
- "grad_norm": 2.3284428119659424,
643
- "kl": 0.021772082669485826,
644
- "learning_rate": 1.2970351387729872e-07,
645
  "loss": 0.0,
646
- "reward": 7.125722587108612,
647
- "reward_std": 0.060849911424156744,
648
- "rewards/concensus_correctness_reward_func": 2.168749988079071,
649
- "rewards/consensus_reward_func": 2.0,
650
- "rewards/cumulative_reward_2": 0.0,
651
- "rewards/final_correctness_reward_func": 0.25,
652
- "rewards/question_recreation_reward_func": 0.9999413713812828,
653
- "rewards/soft_format_reward_func": 0.0,
654
- "rewards/strict_format_reward_func": 0.46875,
655
- "rewards/xmlcount_reward_func": 1.23828125,
656
- "step": 68
657
- },
658
- {
659
- "completion_length": 118.40625,
660
- "epoch": 3.914285714285714,
661
- "grad_norm": 0.014703701250255108,
662
- "kl": 610.1349981201129,
663
- "learning_rate": 1.1576995658775404e-07,
664
- "loss": 0.6101,
665
- "reward": 6.650367766618729,
666
- "reward_std": 0.025721000507473946,
667
- "rewards/concensus_correctness_reward_func": 1.9187499731779099,
668
- "rewards/consensus_reward_func": 2.0,
669
  "rewards/cumulative_reward_2": 0.0,
670
  "rewards/final_correctness_reward_func": 0.0,
671
- "rewards/question_recreation_reward_func": 0.9998052977025509,
672
- "rewards/soft_format_reward_func": 0.0,
673
- "rewards/strict_format_reward_func": 0.484375,
674
- "rewards/xmlcount_reward_func": 1.2474374994635582,
675
- "step": 70
676
- },
677
- {
678
- "completion_length": 117.20833333333333,
679
- "epoch": 4.0,
680
- "grad_norm": 0.14886708557605743,
681
- "kl": 0.06357832976694529,
682
- "learning_rate": 1.0239940674851941e-07,
683
- "loss": 0.0,
684
- "reward": 6.836241086324056,
685
- "reward_std": 0.0,
686
- "rewards/concensus_correctness_reward_func": 1.9266666571299236,
687
- "rewards/consensus_reward_func": 2.0,
688
- "rewards/cumulative_reward_2": 0.0,
689
- "rewards/final_correctness_reward_func": 0.16666666666666666,
690
- "rewards/question_recreation_reward_func": 0.9929078022638956,
691
- "rewards/soft_format_reward_func": 0.0,
692
- "rewards/strict_format_reward_func": 0.5,
693
- "rewards/xmlcount_reward_func": 1.25,
694
- "step": 72
695
- },
696
- {
697
- "completion_length": 120.96875,
698
- "epoch": 4.114285714285714,
699
- "grad_norm": 2.8220157623291016,
700
- "kl": 0.02625376718060579,
701
- "learning_rate": 8.964794509221507e-08,
702
- "loss": 0.0,
703
- "reward": 6.904249906539917,
704
- "reward_std": 0.022627420723438263,
705
- "rewards/concensus_correctness_reward_func": 2.0452499985694885,
706
- "rewards/consensus_reward_func": 2.0,
707
- "rewards/cumulative_reward_2": 0.0,
708
- "rewards/final_correctness_reward_func": 0.125,
709
- "rewards/question_recreation_reward_func": 1.0,
710
- "rewards/soft_format_reward_func": 0.0,
711
- "rewards/strict_format_reward_func": 0.484375,
712
- "rewards/xmlcount_reward_func": 1.2496249973773956,
713
- "step": 74
714
- },
715
- {
716
- "completion_length": 112.25,
717
- "epoch": 4.228571428571429,
718
- "grad_norm": 13.022274017333984,
719
- "kl": 0.10997068358119577,
720
- "learning_rate": 7.756905568047392e-08,
721
- "loss": 0.0001,
722
- "reward": 6.940118342638016,
723
- "reward_std": 0.011048543266952038,
724
- "rewards/concensus_correctness_reward_func": 2.1251249834895134,
725
- "rewards/consensus_reward_func": 1.875,
726
- "rewards/cumulative_reward_2": 0.0,
727
- "rewards/final_correctness_reward_func": 0.25,
728
- "rewards/question_recreation_reward_func": 0.9946808516979218,
729
- "rewards/soft_format_reward_func": 0.0,
730
- "rewards/strict_format_reward_func": 0.46875,
731
- "rewards/xmlcount_reward_func": 1.2265625,
732
- "step": 76
733
- },
734
- {
735
- "completion_length": 116.78125,
736
- "epoch": 4.3428571428571425,
737
- "grad_norm": 0.11638541519641876,
738
- "kl": 0.03174898200086318,
739
- "learning_rate": 6.621340157319996e-08,
740
- "loss": 0.0,
741
- "reward": 7.31488761305809,
742
- "reward_std": 0.0003356575034558773,
743
- "rewards/concensus_correctness_reward_func": 2.190124988555908,
744
- "rewards/consensus_reward_func": 2.0,
745
- "rewards/cumulative_reward_2": 0.0,
746
- "rewards/final_correctness_reward_func": 0.375,
747
- "rewards/question_recreation_reward_func": 0.9997626580297947,
748
  "rewards/soft_format_reward_func": 0.0,
749
- "rewards/strict_format_reward_func": 0.5,
750
- "rewards/xmlcount_reward_func": 1.25,
751
- "step": 78
752
  },
753
  {
754
- "completion_length": 109.9375,
755
- "epoch": 4.457142857142857,
756
- "grad_norm": 0.09145835787057877,
757
- "kl": 0.014472670096438378,
758
- "learning_rate": 5.5628612330087724e-08,
759
  "loss": 0.0,
760
- "reward": 7.670249938964844,
761
- "reward_std": 0.0,
762
- "rewards/concensus_correctness_reward_func": 2.4202499985694885,
763
- "rewards/consensus_reward_func": 2.0,
764
- "rewards/cumulative_reward_2": 0.0,
765
- "rewards/final_correctness_reward_func": 0.5,
766
- "rewards/question_recreation_reward_func": 1.0,
767
- "rewards/soft_format_reward_func": 0.0,
768
- "rewards/strict_format_reward_func": 0.5,
769
- "rewards/xmlcount_reward_func": 1.25,
770
- "step": 80
771
- },
772
- {
773
- "completion_length": 130.3125,
774
- "epoch": 4.571428571428571,
775
- "grad_norm": 11.376300811767578,
776
- "kl": 0.08390317361772759,
777
- "learning_rate": 4.5859084235697235e-08,
778
- "loss": 0.0001,
779
- "reward": 6.354187414050102,
780
- "reward_std": 0.02324613742530346,
781
- "rewards/concensus_correctness_reward_func": 1.7456249836832285,
782
- "rewards/consensus_reward_func": 1.875,
783
  "rewards/cumulative_reward_2": 0.0,
784
  "rewards/final_correctness_reward_func": 0.0,
785
- "rewards/question_recreation_reward_func": 1.0,
786
  "rewards/soft_format_reward_func": 0.0,
787
- "rewards/strict_format_reward_func": 0.484375,
788
- "rewards/xmlcount_reward_func": 1.2491874992847443,
789
- "step": 82
790
- },
791
- {
792
- "completion_length": 120.53125,
793
- "epoch": 4.685714285714286,
794
- "grad_norm": 0.025102941319346428,
795
- "kl": 654.5053322186013,
796
- "learning_rate": 3.6945794086007705e-08,
797
- "loss": 0.6545,
798
- "reward": 7.0527812242507935,
799
- "reward_std": 0.10496116429567337,
800
- "rewards/concensus_correctness_reward_func": 2.0332499966025352,
801
- "rewards/consensus_reward_func": 1.875,
802
- "rewards/cumulative_reward_2": 0.0,
803
- "rewards/final_correctness_reward_func": 0.4375,
804
- "rewards/question_recreation_reward_func": 1.0,
805
- "rewards/soft_format_reward_func": 0.0,
806
- "rewards/strict_format_reward_func": 0.46875,
807
- "rewards/xmlcount_reward_func": 1.23828125,
808
- "step": 84
809
- },
810
- {
811
- "completion_length": 114.5625,
812
- "epoch": 4.8,
813
- "grad_norm": 13.666620254516602,
814
- "kl": 2.7016793186194263,
815
- "learning_rate": 2.892612731749414e-08,
816
- "loss": 0.0027,
817
- "reward": 6.9134474992752075,
818
- "reward_std": 0.0069684546906501055,
819
- "rewards/concensus_correctness_reward_func": 2.043374978005886,
820
- "rewards/consensus_reward_func": 2.0,
821
- "rewards/cumulative_reward_2": 0.0,
822
- "rewards/final_correctness_reward_func": 0.125,
823
- "rewards/question_recreation_reward_func": 0.9950725585222244,
824
- "rewards/soft_format_reward_func": 0.0,
825
- "rewards/strict_format_reward_func": 0.5,
826
- "rewards/xmlcount_reward_func": 1.25,
827
- "step": 86
828
- },
829
- {
830
- "completion_length": 114.96875,
831
- "epoch": 4.914285714285715,
832
- "grad_norm": 0.09325135499238968,
833
- "kl": 10.726173710958392,
834
- "learning_rate": 2.183372119961499e-08,
835
- "loss": 0.0107,
836
- "reward": 6.717237025499344,
837
- "reward_std": 0.2833777070045471,
838
- "rewards/concensus_correctness_reward_func": 1.9829999767243862,
839
- "rewards/consensus_reward_func": 1.9375,
840
- "rewards/cumulative_reward_2": 0.0,
841
- "rewards/final_correctness_reward_func": 0.125,
842
- "rewards/question_recreation_reward_func": 0.9764245375990868,
843
- "rewards/soft_format_reward_func": 0.0,
844
- "rewards/strict_format_reward_func": 0.484375,
845
- "rewards/xmlcount_reward_func": 1.2109375,
846
- "step": 88
847
- },
848
- {
849
- "completion_length": 113.625,
850
- "epoch": 5.0,
851
- "grad_norm": 0.05840963497757912,
852
- "kl": 0.028356703667668626,
853
- "learning_rate": 1.5698323748414122e-08,
854
- "loss": 0.0,
855
- "reward": 6.843000014623006,
856
- "reward_std": 0.0,
857
- "rewards/concensus_correctness_reward_func": 1.9263333181540172,
858
- "rewards/consensus_reward_func": 2.0,
859
- "rewards/cumulative_reward_2": 0.0,
860
- "rewards/final_correctness_reward_func": 0.16666666666666666,
861
- "rewards/question_recreation_reward_func": 1.0,
862
- "rewards/soft_format_reward_func": 0.0,
863
- "rewards/strict_format_reward_func": 0.5,
864
- "rewards/xmlcount_reward_func": 1.25,
865
- "step": 90
866
- },
867
- {
868
- "completion_length": 115.71875,
869
- "epoch": 5.114285714285714,
870
- "grad_norm": 20.699186325073242,
871
- "kl": 0.19849324076494668,
872
- "learning_rate": 1.054566895300324e-08,
873
- "loss": 0.0002,
874
- "reward": 6.53057761490345,
875
- "reward_std": 0.16040338575839996,
876
- "rewards/concensus_correctness_reward_func": 1.8652499951422215,
877
- "rewards/consensus_reward_func": 1.875,
878
- "rewards/cumulative_reward_2": 0.0,
879
- "rewards/final_correctness_reward_func": 0.125,
880
- "rewards/question_recreation_reward_func": 0.9700151830911636,
881
- "rewards/soft_format_reward_func": 0.0,
882
- "rewards/strict_format_reward_func": 0.46875,
883
- "rewards/xmlcount_reward_func": 1.2265625,
884
- "step": 92
885
- },
886
- {
887
- "completion_length": 123.6875,
888
- "epoch": 5.228571428571429,
889
- "grad_norm": 8.814971923828125,
890
- "kl": 1.0889026026070496,
891
- "learning_rate": 6.397368838268496e-09,
892
- "loss": 0.0011,
893
- "reward": 6.928559973835945,
894
- "reward_std": 0.022097086533904076,
895
- "rewards/concensus_correctness_reward_func": 2.069374980404973,
896
- "rewards/consensus_reward_func": 1.875,
897
- "rewards/cumulative_reward_2": 0.0,
898
- "rewards/final_correctness_reward_func": 0.25,
899
- "rewards/question_recreation_reward_func": 0.9998100288212299,
900
- "rewards/soft_format_reward_func": 0.0,
901
- "rewards/strict_format_reward_func": 0.484375,
902
- "rewards/xmlcount_reward_func": 1.25,
903
- "step": 94
904
- },
905
- {
906
- "completion_length": 115.28125,
907
- "epoch": 5.3428571428571425,
908
- "grad_norm": 7.976017475128174,
909
- "kl": 0.14256246562763408,
910
- "learning_rate": 3.2708228165273244e-09,
911
- "loss": 0.0001,
912
- "reward": 6.844999939203262,
913
- "reward_std": 0.0883883461356163,
914
- "rewards/concensus_correctness_reward_func": 1.9387499913573265,
915
- "rewards/consensus_reward_func": 2.0,
916
- "rewards/cumulative_reward_2": 0.0,
917
- "rewards/final_correctness_reward_func": 0.1875,
918
- "rewards/question_recreation_reward_func": 1.0,
919
- "rewards/soft_format_reward_func": 0.0,
920
- "rewards/strict_format_reward_func": 0.46875,
921
- "rewards/xmlcount_reward_func": 1.25,
922
- "step": 96
923
- },
924
- {
925
- "completion_length": 114.75,
926
- "epoch": 5.457142857142857,
927
- "grad_norm": 0.04377632215619087,
928
- "kl": 0.06355893767249654,
929
- "learning_rate": 1.1791447083465133e-09,
930
- "loss": 0.0001,
931
- "reward": 7.034624993801117,
932
- "reward_std": 0.0,
933
- "rewards/concensus_correctness_reward_func": 2.034624993801117,
934
- "rewards/consensus_reward_func": 1.875,
935
- "rewards/cumulative_reward_2": 0.0,
936
- "rewards/final_correctness_reward_func": 0.375,
937
- "rewards/question_recreation_reward_func": 1.0,
938
- "rewards/soft_format_reward_func": 0.0,
939
- "rewards/strict_format_reward_func": 0.5,
940
- "rewards/xmlcount_reward_func": 1.25,
941
- "step": 98
942
  },
943
  {
944
- "completion_length": 107.03125,
945
- "epoch": 5.571428571428571,
946
- "grad_norm": 22.148807525634766,
947
- "kl": 0.03113703287090175,
948
- "learning_rate": 1.3110773862126667e-10,
949
  "loss": 0.0,
950
- "reward": 6.888993263244629,
951
- "reward_std": 0.03314562886953354,
952
- "rewards/concensus_correctness_reward_func": 2.0427499786019325,
953
- "rewards/consensus_reward_func": 2.0,
954
  "rewards/cumulative_reward_2": 0.0,
955
- "rewards/final_correctness_reward_func": 0.125,
956
- "rewards/question_recreation_reward_func": 0.9946808516979218,
957
  "rewards/soft_format_reward_func": 0.0,
958
- "rewards/strict_format_reward_func": 0.484375,
959
- "rewards/xmlcount_reward_func": 1.2421875,
960
- "step": 100
961
  },
962
  {
963
- "epoch": 5.571428571428571,
964
- "step": 100,
965
  "total_flos": 0.0,
966
- "train_loss": 29684.7394626798,
967
- "train_runtime": 550.6438,
968
- "train_samples_per_second": 2.906,
969
- "train_steps_per_second": 0.182
970
  }
971
  ],
972
  "logging_steps": 2,
973
- "max_steps": 100,
974
  "num_input_tokens_seen": 0,
975
- "num_train_epochs": 6,
976
  "save_steps": 25,
977
  "stateful_callbacks": {
978
  "TrainerControl": {
@@ -987,7 +227,7 @@
987
  }
988
  },
989
  "total_flos": 0.0,
990
- "train_batch_size": 2,
991
  "trial_name": null,
992
  "trial_params": null
993
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
  "eval_steps": 500,
7
+ "global_step": 20,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "completion_length": 294.875,
14
+ "epoch": 0.42105263157894735,
15
+ "grad_norm": 7.388230323791504,
16
+ "kl": 0.0018326228964724578,
17
+ "learning_rate": 5e-07,
18
+ "loss": 0.0,
19
+ "reward": 0.23368853772990406,
20
+ "reward_std": 0.298145150532946,
21
+ "rewards/concensus_correctness_reward_func": 0.0,
22
+ "rewards/consensus_reward_func": 0.0625,
23
  "rewards/cumulative_reward_2": 0.0,
24
  "rewards/final_correctness_reward_func": 0.0,
25
+ "rewards/question_recreation_reward_func": 0.09978228050749749,
26
  "rewards/soft_format_reward_func": 0.0,
27
+ "rewards/strict_format_reward_func": 0.0,
28
+ "rewards/xmlcount_reward_func": 0.07140624802559614,
29
  "step": 2
30
  },
31
  {
32
+ "completion_length": 365.21875,
33
+ "epoch": 0.8421052631578947,
34
+ "grad_norm": 23.853408813476562,
35
+ "kl": 0.005862652775249444,
36
+ "learning_rate": 4.864543104251586e-07,
37
  "loss": 0.0,
38
+ "reward": 0.23603129759430885,
39
+ "reward_std": 0.30246378132142127,
40
+ "rewards/concensus_correctness_reward_func": 0.0,
41
+ "rewards/consensus_reward_func": 0.0,
42
  "rewards/cumulative_reward_2": 0.0,
43
+ "rewards/final_correctness_reward_func": 0.0,
44
+ "rewards/question_recreation_reward_func": 0.12512504076585174,
45
  "rewards/soft_format_reward_func": 0.0,
46
+ "rewards/strict_format_reward_func": 0.0,
47
+ "rewards/xmlcount_reward_func": 0.11090624984353781,
48
  "step": 4
49
  },
50
  {
51
+ "completion_length": 226.0,
52
+ "epoch": 1.2105263157894737,
53
+ "grad_norm": 7.582596778869629,
54
+ "kl": 0.0031690104099522743,
55
+ "learning_rate": 4.472851273490984e-07,
56
  "loss": 0.0,
57
+ "reward": 0.16822473385504313,
58
+ "reward_std": 0.5507601164281368,
59
+ "rewards/concensus_correctness_reward_func": 0.0,
60
+ "rewards/consensus_reward_func": 0.0,
61
  "rewards/cumulative_reward_2": 0.0,
62
+ "rewards/final_correctness_reward_func": 0.07142857142857142,
63
+ "rewards/question_recreation_reward_func": 0.1537961567352925,
64
  "rewards/soft_format_reward_func": 0.0,
65
+ "rewards/strict_format_reward_func": 0.0,
66
+ "rewards/xmlcount_reward_func": -0.05699999790106501,
67
  "step": 6
68
  },
69
  {
70
+ "completion_length": 385.09375,
71
+ "epoch": 1.631578947368421,
72
+ "grad_norm": 2.9946677684783936,
73
+ "kl": 0.001347492725471966,
74
+ "learning_rate": 3.867370395306068e-07,
75
  "loss": 0.0,
76
+ "reward": 0.21872268104925752,
77
+ "reward_std": 0.6904346485389397,
78
+ "rewards/concensus_correctness_reward_func": 0.0,
79
+ "rewards/consensus_reward_func": 0.125,
80
  "rewards/cumulative_reward_2": 0.0,
81
+ "rewards/final_correctness_reward_func": 0.0,
82
+ "rewards/question_recreation_reward_func": 0.2022851686924696,
83
  "rewards/soft_format_reward_func": 0.0,
84
+ "rewards/strict_format_reward_func": 0.0,
85
+ "rewards/xmlcount_reward_func": -0.10856249555945396,
86
  "step": 8
87
  },
88
  {
89
+ "completion_length": 446.85714285714283,
90
+ "epoch": 2.0,
91
+ "grad_norm": 0.9212186336517334,
92
+ "kl": 0.0022978045121167918,
93
+ "learning_rate": 3.1137137178519977e-07,
94
  "loss": 0.0,
95
+ "reward": 0.13073402217456273,
96
+ "reward_std": 0.3137416091880628,
97
+ "rewards/concensus_correctness_reward_func": 0.0,
98
+ "rewards/consensus_reward_func": 0.0,
99
  "rewards/cumulative_reward_2": 0.0,
100
+ "rewards/final_correctness_reward_func": 0.0,
101
+ "rewards/question_recreation_reward_func": 0.13244829353477275,
102
  "rewards/soft_format_reward_func": 0.0,
103
+ "rewards/strict_format_reward_func": 0.0,
104
+ "rewards/xmlcount_reward_func": -0.001714284930910383,
105
  "step": 10
106
  },
107
  {
108
+ "completion_length": 368.5625,
109
+ "epoch": 2.4210526315789473,
110
+ "grad_norm": 1.7672561407089233,
111
+ "kl": 0.0013554280303651467,
112
+ "learning_rate": 2.2935516363191693e-07,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  "loss": 0.0,
114
+ "reward": -0.1324465568177402,
115
+ "reward_std": 0.6439249363029376,
116
+ "rewards/concensus_correctness_reward_func": 0.0,
117
+ "rewards/consensus_reward_func": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  "rewards/cumulative_reward_2": 0.0,
119
  "rewards/final_correctness_reward_func": 0.0,
120
+ "rewards/question_recreation_reward_func": 0.09061592211946845,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  "rewards/soft_format_reward_func": 0.0,
122
+ "rewards/strict_format_reward_func": 0.0,
123
+ "rewards/xmlcount_reward_func": -0.22306250617839396,
124
+ "step": 12
125
  },
126
  {
127
+ "completion_length": 217.71875,
128
+ "epoch": 2.8421052631578947,
129
+ "grad_norm": 51.95741653442383,
130
+ "kl": 0.0025395552984264214,
131
+ "learning_rate": 1.4957614383675767e-07,
132
  "loss": 0.0,
133
+ "reward": 0.18613064312376082,
134
+ "reward_std": 0.20095090114045888,
135
+ "rewards/concensus_correctness_reward_func": 0.0013749999925494194,
136
+ "rewards/consensus_reward_func": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  "rewards/cumulative_reward_2": 0.0,
138
  "rewards/final_correctness_reward_func": 0.0,
139
+ "rewards/question_recreation_reward_func": 0.1167868955526501,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  "rewards/soft_format_reward_func": 0.0,
141
+ "rewards/strict_format_reward_func": 0.0,
142
+ "rewards/xmlcount_reward_func": 0.06796875037252903,
143
+ "step": 14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  },
145
  {
146
+ "completion_length": 276.2142857142857,
147
+ "epoch": 3.2105263157894735,
148
+ "grad_norm": 29.23552131652832,
149
+ "kl": 0.0017278649694552378,
150
+ "learning_rate": 8.067960709356478e-08,
151
  "loss": 0.0,
152
+ "reward": 0.2764262727328709,
153
+ "reward_std": 0.45395882214818684,
154
+ "rewards/concensus_correctness_reward_func": 0.0,
155
+ "rewards/consensus_reward_func": 0.07142857142857142,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  "rewards/cumulative_reward_2": 0.0,
157
  "rewards/final_correctness_reward_func": 0.0,
158
+ "rewards/question_recreation_reward_func": 0.16878341936639377,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  "rewards/soft_format_reward_func": 0.0,
160
+ "rewards/strict_format_reward_func": 0.0,
161
+ "rewards/xmlcount_reward_func": 0.03621428566319602,
162
+ "step": 16
163
  },
164
  {
165
+ "completion_length": 319.46875,
166
+ "epoch": 3.6315789473684212,
167
+ "grad_norm": 52.476932525634766,
168
+ "kl": 0.004187825550616253,
169
+ "learning_rate": 3.013156219837776e-08,
170
  "loss": 0.0,
171
+ "reward": 0.058828551205806434,
172
+ "reward_std": 0.11632925440790132,
173
+ "rewards/concensus_correctness_reward_func": 0.0,
174
+ "rewards/consensus_reward_func": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  "rewards/cumulative_reward_2": 0.0,
176
  "rewards/final_correctness_reward_func": 0.0,
177
+ "rewards/question_recreation_reward_func": 0.0568285504123196,
178
  "rewards/soft_format_reward_func": 0.0,
179
+ "rewards/strict_format_reward_func": 0.0,
180
+ "rewards/xmlcount_reward_func": 0.0020000003278255463,
181
+ "step": 18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  },
183
  {
184
+ "completion_length": 451.92857142857144,
185
+ "epoch": 4.0,
186
+ "grad_norm": 6.4608612060546875,
187
+ "kl": 0.0014391274640469679,
188
+ "learning_rate": 3.4096741493194193e-09,
189
  "loss": 0.0,
190
+ "reward": 0.09713409947497505,
191
+ "reward_std": 0.34493599299873623,
192
+ "rewards/concensus_correctness_reward_func": 0.0,
193
+ "rewards/consensus_reward_func": 0.0,
194
  "rewards/cumulative_reward_2": 0.0,
195
+ "rewards/final_correctness_reward_func": 0.07142857142857142,
196
+ "rewards/question_recreation_reward_func": 0.1079555196421487,
197
  "rewards/soft_format_reward_func": 0.0,
198
+ "rewards/strict_format_reward_func": 0.0,
199
+ "rewards/xmlcount_reward_func": -0.08224999904632568,
200
+ "step": 20
201
  },
202
  {
203
+ "epoch": 4.0,
204
+ "step": 20,
205
  "total_flos": 0.0,
206
+ "train_loss": 2.47737436893658e-06,
207
+ "train_runtime": 471.1429,
208
+ "train_samples_per_second": 0.679,
209
+ "train_steps_per_second": 0.042
210
  }
211
  ],
212
  "logging_steps": 2,
213
+ "max_steps": 20,
214
  "num_input_tokens_seen": 0,
215
+ "num_train_epochs": 5,
216
  "save_steps": 25,
217
  "stateful_callbacks": {
218
  "TrainerControl": {
 
227
  }
228
  },
229
  "total_flos": 0.0,
230
+ "train_batch_size": 4,
231
  "trial_name": null,
232
  "trial_params": null
233
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b25d9cecb022bd8466bc95a28810ec4cc1b87989edb78a200a25872bbeda8aee
3
- size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:590cd41cfd14a5b1b8e84d269c5bcc6f22c8809140576b81ac98d0782e5a6e5e
3
+ size 6008