MAGICYA0 commited on
Commit
be1634b
·
verified ·
1 Parent(s): 9d0fa7e

End of training

Browse files
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- base_model: Gensyn/Qwen2.5-0.5B-Instruct
3
  library_name: transformers
4
  model_name: Qwen2.5-0.5B-Instruct-Gensyn-Swarm-bold_rugged_camel
5
  tags:
@@ -8,14 +8,13 @@ tags:
8
  - grpo
9
  - gensyn
10
  - I am bold rugged camel
11
- - unsloth
12
  - trl
13
  licence: license
14
  ---
15
 
16
  # Model Card for Qwen2.5-0.5B-Instruct-Gensyn-Swarm-bold_rugged_camel
17
 
18
- This model is a fine-tuned version of [Gensyn/Qwen2.5-0.5B-Instruct](https://huggingface.co/Gensyn/Qwen2.5-0.5B-Instruct).
19
  It has been trained using [TRL](https://github.com/huggingface/trl).
20
 
21
  ## Quick start
@@ -38,10 +37,10 @@ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing
38
 
39
  ### Framework versions
40
 
41
- - TRL: 0.15.2
42
- - Transformers: 4.48.2
43
- - Pytorch: 2.5.1
44
- - Datasets: 3.6.0
45
  - Tokenizers: 0.21.1
46
 
47
  ## Citations
@@ -63,7 +62,7 @@ Cite TRL as:
63
  ```bibtex
64
  @misc{vonwerra2022trl,
65
  title = {{TRL: Transformer Reinforcement Learning}},
66
- author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
67
  year = 2020,
68
  journal = {GitHub repository},
69
  publisher = {GitHub},
 
1
  ---
2
+ base_model: unsloth/Qwen2.5-0.5B-Instruct
3
  library_name: transformers
4
  model_name: Qwen2.5-0.5B-Instruct-Gensyn-Swarm-bold_rugged_camel
5
  tags:
 
8
  - grpo
9
  - gensyn
10
  - I am bold rugged camel
 
11
  - trl
12
  licence: license
13
  ---
14
 
15
  # Model Card for Qwen2.5-0.5B-Instruct-Gensyn-Swarm-bold_rugged_camel
16
 
17
+ This model is a fine-tuned version of [unsloth/Qwen2.5-0.5B-Instruct](https://huggingface.co/unsloth/Qwen2.5-0.5B-Instruct).
18
  It has been trained using [TRL](https://github.com/huggingface/trl).
19
 
20
  ## Quick start
 
37
 
38
  ### Framework versions
39
 
40
+ - TRL: 0.17.0
41
+ - Transformers: 4.51.3
42
+ - Pytorch: 2.7.0
43
+ - Datasets: 3.5.1
44
  - Tokenizers: 0.21.1
45
 
46
  ## Citations
 
62
  ```bibtex
63
  @misc{vonwerra2022trl,
64
  title = {{TRL: Transformer Reinforcement Learning}},
65
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
66
  year = 2020,
67
  journal = {GitHub repository},
68
  publisher = {GitHub},
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.17895936399698256,
4
- "train_runtime": 3065.998,
5
- "train_samples": 122,
6
- "train_samples_per_second": 4.175,
7
- "train_steps_per_second": 0.033
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": -0.006354737589201066,
4
+ "train_runtime": 2680.4196,
5
+ "train_samples": 12,
6
+ "train_samples_per_second": 0.149,
7
+ "train_steps_per_second": 0.019
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69272824f3e9ad2ec3a1b9cc52db3e0ce113682ce4278ca3cde44e534a1badd7
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:007406f7c32c1b25bc4351a4ecf89b40d66feed1715992e34ef5188c158b768e
3
  size 1976163472
special_tokens_map.json CHANGED
@@ -22,7 +22,7 @@
22
  "single_word": false
23
  },
24
  "pad_token": {
25
- "content": "<|endoftext|>",
26
  "lstrip": false,
27
  "normalized": false,
28
  "rstrip": false,
 
22
  "single_word": false
23
  },
24
  "pad_token": {
25
+ "content": "<|vision_pad|>",
26
  "lstrip": false,
27
  "normalized": false,
28
  "rstrip": false,
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5eee858c5123a4279c3e1f7b81247343f356ac767940b2692a928ad929543214
3
- size 11422063
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64e71213db910f5cafa86d35091f37393dcc344b1bbc34091d1b3eed4cca01d5
3
+ size 11422064
tokenizer_config.json CHANGED
@@ -200,8 +200,9 @@
200
  "eos_token": "<|im_end|>",
201
  "errors": "replace",
202
  "extra_special_tokens": {},
203
- "model_max_length": 131072,
204
- "pad_token": "<|endoftext|>",
 
205
  "split_special_tokens": false,
206
  "tokenizer_class": "Qwen2Tokenizer",
207
  "unk_token": null
 
200
  "eos_token": "<|im_end|>",
201
  "errors": "replace",
202
  "extra_special_tokens": {},
203
+ "model_max_length": 32768,
204
+ "pad_token": "<|vision_pad|>",
205
+ "padding_side": "left",
206
  "split_special_tokens": false,
207
  "tokenizer_class": "Qwen2Tokenizer",
208
  "unk_token": null
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.17895936399698256,
4
- "train_runtime": 3065.998,
5
- "train_samples": 122,
6
- "train_samples_per_second": 4.175,
7
- "train_steps_per_second": 0.033
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": -0.006354737589201066,
4
+ "train_runtime": 2680.4196,
5
+ "train_samples": 12,
6
+ "train_samples_per_second": 0.149,
7
+ "train_steps_per_second": 0.019
8
  }
trainer_state.json CHANGED
@@ -1,408 +1,1004 @@
1
  {
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 14.262295081967213,
5
  "eval_steps": 500,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 100.3828125,
13
- "epoch": 0.6557377049180327,
14
- "grad_norm": 18.24576187133789,
15
- "kl": 21.25354176312685,
16
- "learning_rate": 5e-07,
17
- "loss": 0.0213,
18
- "reward": 5.629883229732513,
19
- "reward_std": 0.9574323672801256,
20
- "rewards/concensus_correctness_reward_func": 2.0075625002384188,
21
- "rewards/consensus_reward_func": 1.89375,
22
- "rewards/cumulative_reward_2": 0.0,
23
- "rewards/final_correctness_reward_func": 0.1,
24
- "rewards/question_recreation_reward_func": 0.18748326459899545,
25
- "rewards/soft_format_reward_func": 0.0,
26
- "rewards/strict_format_reward_func": 0.33671875,
27
- "rewards/xmlcount_reward_func": 1.1043687596917153,
28
- "step": 5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  },
30
  {
31
- "completion_length": 107.58333333333333,
32
- "epoch": 1.3934426229508197,
33
- "grad_norm": 148.78001403808594,
34
- "kl": 98.14546892188844,
35
- "learning_rate": 4.965903258506806e-07,
36
- "loss": 0.1031,
37
- "reward": 5.229624305452619,
38
- "reward_std": 1.4278448562004737,
39
- "rewards/concensus_correctness_reward_func": 1.8250654481706166,
40
- "rewards/consensus_reward_func": 1.7857142857142858,
41
- "rewards/cumulative_reward_2": 0.0,
42
- "rewards/final_correctness_reward_func": 0.03869047619047619,
43
- "rewards/question_recreation_reward_func": 0.21489370844903447,
44
- "rewards/soft_format_reward_func": 0.0,
45
- "rewards/strict_format_reward_func": 0.3162202380952381,
46
- "rewards/xmlcount_reward_func": 1.0490401770387376,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  "step": 10
48
  },
49
  {
50
- "completion_length": 104.20238095238095,
51
- "epoch": 2.1311475409836067,
52
- "grad_norm": 25.345481872558594,
53
- "kl": 20.04661506130582,
54
- "learning_rate": 4.864543104251586e-07,
55
- "loss": 0.021,
56
- "reward": 5.364634025664556,
57
- "reward_std": 1.250795535388447,
58
- "rewards/concensus_correctness_reward_func": 1.8868392705917358,
59
- "rewards/consensus_reward_func": 1.8333333333333333,
60
- "rewards/cumulative_reward_2": 0.0,
61
- "rewards/final_correctness_reward_func": 0.0744047619047619,
62
- "rewards/question_recreation_reward_func": 0.18857152263323465,
63
- "rewards/soft_format_reward_func": 0.0,
64
- "rewards/strict_format_reward_func": 0.31845238095238093,
65
- "rewards/xmlcount_reward_func": 1.0630327349617368,
66
- "step": 15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  },
68
  {
69
- "completion_length": 102.1984375,
70
- "epoch": 2.7868852459016393,
71
- "grad_norm": 77.89488983154297,
72
- "kl": 30.944871386885644,
73
- "learning_rate": 4.698684378016222e-07,
74
- "loss": 0.0309,
75
- "reward": 5.653480577468872,
76
- "reward_std": 1.2769613616168498,
77
- "rewards/concensus_correctness_reward_func": 2.0845031082630157,
78
- "rewards/consensus_reward_func": 1.846875,
79
- "rewards/cumulative_reward_2": 0.0,
80
- "rewards/final_correctness_reward_func": 0.096875,
81
- "rewards/question_recreation_reward_func": 0.192813384719193,
82
- "rewards/soft_format_reward_func": 0.0,
83
- "rewards/strict_format_reward_func": 0.3375,
84
- "rewards/xmlcount_reward_func": 1.0949140563607216,
85
- "step": 20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  },
87
  {
88
- "completion_length": 103.16220238095238,
89
- "epoch": 3.5245901639344264,
90
- "grad_norm": 24.907499313354492,
91
- "kl": 230.05476773069017,
92
- "learning_rate": 4.472851273490984e-07,
93
- "loss": 0.2416,
94
- "reward": 5.325274070103963,
95
- "reward_std": 1.2981156784863699,
96
- "rewards/concensus_correctness_reward_func": 1.8906339208285015,
97
- "rewards/consensus_reward_func": 1.8273809523809523,
98
- "rewards/cumulative_reward_2": 0.0,
99
- "rewards/final_correctness_reward_func": 0.03869047619047619,
100
- "rewards/question_recreation_reward_func": 0.18287525122009574,
101
- "rewards/soft_format_reward_func": 0.0,
102
- "rewards/strict_format_reward_func": 0.31919642857142855,
103
- "rewards/xmlcount_reward_func": 1.066497029293151,
104
- "step": 25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  },
106
  {
107
- "completion_length": 103.94345238095238,
108
- "epoch": 4.262295081967213,
109
- "grad_norm": 394.05816650390625,
110
- "kl": 61.65677780338696,
111
- "learning_rate": 4.193203929064353e-07,
112
- "loss": 0.0647,
113
- "reward": 5.575706357047672,
114
- "reward_std": 1.164041012170769,
115
- "rewards/concensus_correctness_reward_func": 2.0769047424906777,
116
- "rewards/consensus_reward_func": 1.8482142857142858,
117
- "rewards/cumulative_reward_2": 0.0,
118
- "rewards/final_correctness_reward_func": 0.03869047619047619,
119
- "rewards/question_recreation_reward_func": 0.1917480392647641,
120
- "rewards/soft_format_reward_func": 0.0,
121
- "rewards/strict_format_reward_func": 0.33630952380952384,
122
- "rewards/xmlcount_reward_func": 1.083839285941351,
123
- "step": 30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  },
125
  {
126
- "completion_length": 99.4546875,
127
- "epoch": 4.918032786885246,
128
- "grad_norm": 20.789705276489258,
129
- "kl": 22.802349741756917,
130
- "learning_rate": 3.867370395306068e-07,
131
- "loss": 0.0228,
132
- "reward": 5.526724541187287,
133
- "reward_std": 1.3097478641197084,
134
- "rewards/concensus_correctness_reward_func": 2.0230187296867372,
135
- "rewards/consensus_reward_func": 1.825,
136
- "rewards/cumulative_reward_2": 0.0,
137
- "rewards/final_correctness_reward_func": 0.071875,
138
- "rewards/question_recreation_reward_func": 0.19072145251557232,
139
- "rewards/soft_format_reward_func": 0.0,
140
- "rewards/strict_format_reward_func": 0.33125,
141
- "rewards/xmlcount_reward_func": 1.0848593652248382,
142
- "step": 35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  },
144
  {
145
- "completion_length": 103.71577380952381,
146
- "epoch": 5.655737704918033,
147
- "grad_norm": 45803.5234375,
148
- "kl": 1866.460280713581,
149
- "learning_rate": 3.5042385616324236e-07,
150
- "loss": 1.9598,
151
- "reward": 5.331885190237136,
152
- "reward_std": 1.2931899396436555,
153
- "rewards/concensus_correctness_reward_func": 1.847133906114669,
154
- "rewards/consensus_reward_func": 1.8035714285714286,
155
- "rewards/cumulative_reward_2": 0.0,
156
- "rewards/final_correctness_reward_func": 0.05654761904761905,
157
- "rewards/question_recreation_reward_func": 0.20447895285629092,
158
- "rewards/soft_format_reward_func": 0.0,
159
- "rewards/strict_format_reward_func": 0.33482142857142855,
160
- "rewards/xmlcount_reward_func": 1.085331841593697,
161
- "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  },
163
  {
164
- "completion_length": 101.29017857142857,
165
- "epoch": 6.39344262295082,
166
- "grad_norm": 244.06797790527344,
167
- "kl": 121.59677911514328,
168
- "learning_rate": 3.1137137178519977e-07,
169
- "loss": 0.1277,
170
- "reward": 5.363064402625675,
171
- "reward_std": 1.220301142671988,
172
- "rewards/concensus_correctness_reward_func": 1.9234166599455333,
173
- "rewards/consensus_reward_func": 1.8244047619047619,
174
- "rewards/cumulative_reward_2": 0.0,
175
- "rewards/final_correctness_reward_func": 0.03273809523809524,
176
- "rewards/question_recreation_reward_func": 0.18449295338775432,
177
- "rewards/soft_format_reward_func": 0.0,
178
- "rewards/strict_format_reward_func": 0.32068452380952384,
179
- "rewards/xmlcount_reward_func": 1.0773273834160395,
180
- "step": 45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  },
182
  {
183
- "completion_length": 105.33779761904762,
184
- "epoch": 7.131147540983607,
185
- "grad_norm": 36.396759033203125,
186
- "kl": 139.66608831428346,
187
- "learning_rate": 2.706448363680831e-07,
188
- "loss": 0.1467,
189
- "reward": 5.507648865381877,
190
- "reward_std": 1.35185175743841,
191
- "rewards/concensus_correctness_reward_func": 2.0074226146652583,
192
- "rewards/consensus_reward_func": 1.8154761904761905,
193
- "rewards/cumulative_reward_2": 0.0,
194
- "rewards/final_correctness_reward_func": 0.08928571428571429,
195
- "rewards/question_recreation_reward_func": 0.19738249081586087,
196
- "rewards/soft_format_reward_func": 0.0,
197
- "rewards/strict_format_reward_func": 0.33407738095238093,
198
- "rewards/xmlcount_reward_func": 1.0640044609705608,
199
- "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  },
201
  {
202
- "completion_length": 103.2515625,
203
- "epoch": 7.786885245901639,
204
- "grad_norm": 828.4172973632812,
205
- "kl": 65.73979671299458,
206
- "learning_rate": 2.2935516363191693e-07,
207
- "loss": 0.0657,
208
- "reward": 5.565059649944305,
209
- "reward_std": 1.3059047222137452,
210
- "rewards/concensus_correctness_reward_func": 2.053659364581108,
211
- "rewards/consensus_reward_func": 1.8375,
212
- "rewards/cumulative_reward_2": 0.0,
213
- "rewards/final_correctness_reward_func": 0.071875,
214
- "rewards/question_recreation_reward_func": 0.1800190585665405,
215
- "rewards/soft_format_reward_func": 0.0,
216
- "rewards/strict_format_reward_func": 0.34140625,
217
- "rewards/xmlcount_reward_func": 1.0805999979376792,
218
- "step": 55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  },
220
  {
221
- "completion_length": 101.42261904761905,
222
- "epoch": 8.524590163934427,
223
- "grad_norm": 276.790771484375,
224
- "kl": 38.72009684358324,
225
- "learning_rate": 1.886286282148002e-07,
226
- "loss": 0.0407,
227
- "reward": 5.529350417000907,
228
- "reward_std": 1.1430614941886492,
229
- "rewards/concensus_correctness_reward_func": 1.9908987851369948,
230
- "rewards/consensus_reward_func": 1.8392857142857142,
231
- "rewards/cumulative_reward_2": 0.0,
232
- "rewards/final_correctness_reward_func": 0.0625,
233
- "rewards/question_recreation_reward_func": 0.20330874985527425,
234
- "rewards/soft_format_reward_func": 0.0,
235
- "rewards/strict_format_reward_func": 0.33035714285714285,
236
- "rewards/xmlcount_reward_func": 1.1030000008287884,
237
- "step": 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  },
239
  {
240
- "completion_length": 103.26190476190476,
241
- "epoch": 9.262295081967213,
242
- "grad_norm": 151.35389709472656,
243
- "kl": 71.02663791747321,
244
- "learning_rate": 1.4957614383675767e-07,
245
- "loss": 0.0746,
246
- "reward": 5.519129571460542,
247
- "reward_std": 1.3336647582196055,
248
- "rewards/concensus_correctness_reward_func": 2.0491041512716386,
249
- "rewards/consensus_reward_func": 1.8154761904761905,
250
- "rewards/cumulative_reward_2": 0.0,
251
- "rewards/final_correctness_reward_func": 0.08928571428571429,
252
- "rewards/question_recreation_reward_func": 0.17715932703798726,
253
- "rewards/soft_format_reward_func": 0.0,
254
- "rewards/strict_format_reward_func": 0.32142857142857145,
255
- "rewards/xmlcount_reward_func": 1.0666756048088981,
256
- "step": 65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  },
258
  {
259
- "completion_length": 104.4890625,
260
- "epoch": 9.918032786885245,
261
- "grad_norm": 83.57910919189453,
262
- "kl": 280.223268455267,
263
- "learning_rate": 1.1326296046939333e-07,
264
- "loss": 0.2802,
265
- "reward": 5.418901389837265,
266
- "reward_std": 1.4045355953276157,
267
- "rewards/concensus_correctness_reward_func": 1.9587249845266341,
268
- "rewards/consensus_reward_func": 1.790625,
269
- "rewards/cumulative_reward_2": 0.0,
270
- "rewards/final_correctness_reward_func": 0.075,
271
- "rewards/question_recreation_reward_func": 0.19451857786625623,
272
- "rewards/soft_format_reward_func": 0.0,
273
- "rewards/strict_format_reward_func": 0.328125,
274
- "rewards/xmlcount_reward_func": 1.0719077974557876,
275
- "step": 70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  },
277
  {
278
- "completion_length": 102.25297619047619,
279
- "epoch": 10.655737704918034,
280
- "grad_norm": 14.267416954040527,
281
- "kl": 29.046702690067747,
282
- "learning_rate": 8.067960709356478e-08,
283
- "loss": 0.0305,
284
- "reward": 5.528152045749483,
285
- "reward_std": 1.3876310292453993,
286
- "rewards/concensus_correctness_reward_func": 2.0544732014338174,
287
- "rewards/consensus_reward_func": 1.8035714285714286,
288
- "rewards/cumulative_reward_2": 0.0,
289
- "rewards/final_correctness_reward_func": 0.07142857142857142,
290
- "rewards/question_recreation_reward_func": 0.1875746830233506,
291
- "rewards/soft_format_reward_func": 0.0,
292
- "rewards/strict_format_reward_func": 0.33556547619047616,
293
- "rewards/xmlcount_reward_func": 1.075538693439393,
294
- "step": 75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  },
296
  {
297
- "completion_length": 102.58630952380952,
298
- "epoch": 11.39344262295082,
299
- "grad_norm": 31.220264434814453,
300
- "kl": 134.90475826036362,
301
- "learning_rate": 5.271487265090163e-08,
302
- "loss": 0.1416,
303
- "reward": 5.555762756438482,
304
- "reward_std": 1.4382377146255403,
305
- "rewards/concensus_correctness_reward_func": 2.0338958217984153,
306
- "rewards/consensus_reward_func": 1.8154761904761905,
307
- "rewards/cumulative_reward_2": 0.0,
308
- "rewards/final_correctness_reward_func": 0.08035714285714286,
309
- "rewards/question_recreation_reward_func": 0.19962290736536184,
310
- "rewards/soft_format_reward_func": 0.0,
311
- "rewards/strict_format_reward_func": 0.3444940476190476,
312
- "rewards/xmlcount_reward_func": 1.0819166629087358,
313
- "step": 80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  },
315
  {
316
- "completion_length": 102.8764880952381,
317
- "epoch": 12.131147540983607,
318
- "grad_norm": 59.1387939453125,
319
- "kl": 87.94486146313804,
320
- "learning_rate": 3.013156219837776e-08,
321
- "loss": 0.0923,
322
- "reward": 5.488595451627459,
323
- "reward_std": 1.343421846628189,
324
- "rewards/concensus_correctness_reward_func": 2.009145816167196,
325
- "rewards/consensus_reward_func": 1.8154761904761905,
326
- "rewards/cumulative_reward_2": 0.0,
327
- "rewards/final_correctness_reward_func": 0.07142857142857142,
328
- "rewards/question_recreation_reward_func": 0.18893775813991115,
329
- "rewards/soft_format_reward_func": 0.0,
330
- "rewards/strict_format_reward_func": 0.3288690476190476,
331
- "rewards/xmlcount_reward_func": 1.07473809946151,
332
- "step": 85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  },
334
  {
335
- "completion_length": 98.6375,
336
- "epoch": 12.78688524590164,
337
- "grad_norm": 74.56422424316406,
338
- "kl": 35.45555876493454,
339
- "learning_rate": 1.3545689574841341e-08,
340
- "loss": 0.0355,
341
- "reward": 5.770984804630279,
342
- "reward_std": 1.1192259122617543,
343
- "rewards/concensus_correctness_reward_func": 2.1744687259197235,
344
- "rewards/consensus_reward_func": 1.86875,
345
- "rewards/cumulative_reward_2": 0.0,
346
- "rewards/final_correctness_reward_func": 0.078125,
347
- "rewards/question_recreation_reward_func": 0.19403170635923744,
348
- "rewards/soft_format_reward_func": 0.0,
349
- "rewards/strict_format_reward_func": 0.3484375,
350
- "rewards/xmlcount_reward_func": 1.1071718871593474,
351
- "step": 90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  },
353
  {
354
- "completion_length": 103.7514880952381,
355
- "epoch": 13.524590163934427,
356
- "grad_norm": 32.33314895629883,
357
- "kl": 23.102159996827442,
358
- "learning_rate": 3.4096741493194193e-09,
359
- "loss": 0.0243,
360
- "reward": 5.315151680083502,
361
- "reward_std": 1.443944205485639,
362
- "rewards/concensus_correctness_reward_func": 1.9253303380239577,
363
- "rewards/consensus_reward_func": 1.7827380952380953,
364
- "rewards/cumulative_reward_2": 0.0,
365
- "rewards/final_correctness_reward_func": 0.03273809523809524,
366
- "rewards/question_recreation_reward_func": 0.19389274530112743,
367
- "rewards/soft_format_reward_func": 0.0,
368
- "rewards/strict_format_reward_func": 0.33407738095238093,
369
- "rewards/xmlcount_reward_func": 1.0463750177905673,
370
- "step": 95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  },
372
  {
373
- "completion_length": 101.82142857142857,
374
- "epoch": 14.262295081967213,
375
- "grad_norm": 35.32180404663086,
376
- "kl": 51.68701913243248,
377
- "learning_rate": 0.0,
378
- "loss": 0.0543,
379
- "reward": 5.64833756855556,
380
- "reward_std": 1.2089521551416034,
381
- "rewards/concensus_correctness_reward_func": 2.118910712855203,
382
- "rewards/consensus_reward_func": 1.8482142857142858,
383
- "rewards/cumulative_reward_2": 0.0,
384
- "rewards/final_correctness_reward_func": 0.0744047619047619,
385
- "rewards/question_recreation_reward_func": 0.17145216119076526,
386
- "rewards/soft_format_reward_func": 0.0,
387
- "rewards/strict_format_reward_func": 0.34375,
388
- "rewards/xmlcount_reward_func": 1.0916056491079784,
389
- "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
  },
391
  {
392
- "epoch": 14.262295081967213,
393
- "step": 100,
394
  "total_flos": 0.0,
395
- "train_loss": 0.17895936399698256,
396
- "train_runtime": 3065.998,
397
- "train_samples_per_second": 4.175,
398
- "train_steps_per_second": 0.033
399
  }
400
  ],
401
- "logging_steps": 5,
402
- "max_steps": 100,
403
- "num_input_tokens_seen": 0,
404
- "num_train_epochs": 15,
405
- "save_steps": 50,
406
  "stateful_callbacks": {
407
  "TrainerControl": {
408
  "args": {
@@ -416,7 +1012,7 @@
416
  }
417
  },
418
  "total_flos": 0.0,
419
- "train_batch_size": 16,
420
  "trial_name": null,
421
  "trial_params": null
422
  }
 
1
  {
2
+ "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 16.666666666666668,
6
  "eval_steps": 500,
7
+ "global_step": 50,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.0,
19
+ "completions/max_length": 5.5,
20
+ "completions/max_terminated_length": 5.5,
21
+ "completions/mean_length": 4.4375,
22
+ "completions/mean_terminated_length": 4.4375,
23
+ "completions/min_length": 4.0,
24
+ "completions/min_terminated_length": 4.0,
25
+ "epoch": 0.6666666666666666,
26
+ "grad_norm": 1.122226422012318e-06,
27
+ "kl": -1.4901161415892261e-09,
28
+ "learning_rate": 2.5e-07,
29
+ "loss": -0.0,
30
+ "num_tokens": 1607.0,
31
+ "reward": 1.2525351345539093,
32
+ "reward_std": 0.0,
33
+ "rewards/concensus_correctness_reward_func/mean": 1.20250004529953,
34
+ "rewards/concensus_correctness_reward_func/std": 0.9595301449298859,
35
+ "rewards/consensus_reward_func/mean": 0.0,
36
+ "rewards/consensus_reward_func/std": 0.0,
37
+ "rewards/cumulative_reward_2/mean": 0.0,
38
+ "rewards/cumulative_reward_2/std": 0.0,
39
+ "rewards/final_correctness_reward_func/mean": 0.0,
40
+ "rewards/final_correctness_reward_func/std": 0.0,
41
+ "rewards/question_recreation_reward_func/mean": 0.050035042222589254,
42
+ "rewards/question_recreation_reward_func/std": 0.012183232656752807,
43
+ "rewards/soft_format_reward_func/mean": 0.0,
44
+ "rewards/soft_format_reward_func/std": 0.0,
45
+ "rewards/strict_format_reward_func/mean": 0.0,
46
+ "rewards/strict_format_reward_func/std": 0.0,
47
+ "rewards/xmlcount_reward_func/mean": 0.0,
48
+ "rewards/xmlcount_reward_func/std": 0.0,
49
+ "step": 2
50
  },
51
  {
52
+ "clip_ratio/high_max": 0.0,
53
+ "clip_ratio/high_mean": 0.0,
54
+ "clip_ratio/low_mean": 0.0,
55
+ "clip_ratio/low_min": 0.0,
56
+ "clip_ratio/region_mean": 0.0,
57
+ "completions/clipped_ratio": 0.0,
58
+ "completions/max_length": 4.5,
59
+ "completions/max_terminated_length": 4.5,
60
+ "completions/mean_length": 4.125,
61
+ "completions/mean_terminated_length": 4.125,
62
+ "completions/min_length": 4.0,
63
+ "completions/min_terminated_length": 4.0,
64
+ "epoch": 1.3333333333333333,
65
+ "grad_norm": 5.171295561012812e-05,
66
+ "kl": 1.7024576948188042e-07,
67
+ "learning_rate": 4.994647308096508e-07,
68
+ "loss": 0.0,
69
+ "num_tokens": 3209.0,
70
+ "reward": 1.275140255689621,
71
+ "reward_std": 0.0,
72
+ "rewards/concensus_correctness_reward_func/mean": 1.1945000141859055,
73
+ "rewards/concensus_correctness_reward_func/std": 0.1435021311044693,
74
+ "rewards/consensus_reward_func/mean": 0.0,
75
+ "rewards/consensus_reward_func/std": 0.0,
76
+ "rewards/cumulative_reward_2/mean": 0.0,
77
+ "rewards/cumulative_reward_2/std": 0.0,
78
+ "rewards/final_correctness_reward_func/mean": 0.0,
79
+ "rewards/final_correctness_reward_func/std": 0.0,
80
+ "rewards/question_recreation_reward_func/mean": 0.0806402824819088,
81
+ "rewards/question_recreation_reward_func/std": 0.005166131537407637,
82
+ "rewards/soft_format_reward_func/mean": 0.0,
83
+ "rewards/soft_format_reward_func/std": 0.0,
84
+ "rewards/strict_format_reward_func/mean": 0.0,
85
+ "rewards/strict_format_reward_func/std": 0.0,
86
+ "rewards/xmlcount_reward_func/mean": 0.0,
87
+ "rewards/xmlcount_reward_func/std": 0.0,
88
+ "step": 4
89
+ },
90
+ {
91
+ "clip_ratio/high_max": 0.0,
92
+ "clip_ratio/high_mean": 0.0,
93
+ "clip_ratio/low_mean": 0.0,
94
+ "clip_ratio/low_min": 0.0,
95
+ "clip_ratio/region_mean": 0.0,
96
+ "completions/clipped_ratio": 0.0,
97
+ "completions/max_length": 5.0,
98
+ "completions/max_terminated_length": 5.0,
99
+ "completions/mean_length": 4.3125,
100
+ "completions/mean_terminated_length": 4.3125,
101
+ "completions/min_length": 4.0,
102
+ "completions/min_terminated_length": 4.0,
103
+ "epoch": 2.0,
104
+ "grad_norm": 3.3832443513404087e-09,
105
+ "kl": 0.00015763420668690742,
106
+ "learning_rate": 4.951963201008075e-07,
107
+ "loss": 0.0,
108
+ "num_tokens": 4814.0,
109
+ "reward": 0.9683559592813253,
110
+ "reward_std": 0.0,
111
+ "rewards/concensus_correctness_reward_func/mean": 0.9620000123977661,
112
+ "rewards/concensus_correctness_reward_func/std": 0.0,
113
+ "rewards/consensus_reward_func/mean": 0.0,
114
+ "rewards/consensus_reward_func/std": 0.0,
115
+ "rewards/cumulative_reward_2/mean": 0.0,
116
+ "rewards/cumulative_reward_2/std": 0.0,
117
+ "rewards/final_correctness_reward_func/mean": 0.0,
118
+ "rewards/final_correctness_reward_func/std": 0.0,
119
+ "rewards/question_recreation_reward_func/mean": 0.006355952471494675,
120
+ "rewards/question_recreation_reward_func/std": 0.0015629313420504332,
121
+ "rewards/soft_format_reward_func/mean": 0.0,
122
+ "rewards/soft_format_reward_func/std": 0.0,
123
+ "rewards/strict_format_reward_func/mean": 0.0,
124
+ "rewards/strict_format_reward_func/std": 0.0,
125
+ "rewards/xmlcount_reward_func/mean": 0.0,
126
+ "rewards/xmlcount_reward_func/std": 0.0,
127
+ "step": 6
128
+ },
129
+ {
130
+ "clip_ratio/high_max": 0.0,
131
+ "clip_ratio/high_mean": 0.0,
132
+ "clip_ratio/low_mean": 0.0,
133
+ "clip_ratio/low_min": 0.0,
134
+ "clip_ratio/region_mean": 0.0,
135
+ "completions/clipped_ratio": 0.0,
136
+ "completions/max_length": 11.0,
137
+ "completions/max_terminated_length": 11.0,
138
+ "completions/mean_length": 5.0,
139
+ "completions/mean_terminated_length": 5.0,
140
+ "completions/min_length": 4.0,
141
+ "completions/min_terminated_length": 4.0,
142
+ "epoch": 2.6666666666666665,
143
+ "grad_norm": 1.9365202774679346e-09,
144
+ "kl": 0.014343254268169403,
145
+ "learning_rate": 4.867325323737765e-07,
146
+ "loss": -0.0441,
147
+ "num_tokens": 6430.0,
148
+ "reward": 2.1415916681289673,
149
+ "reward_std": 0.0054927063174545765,
150
+ "rewards/concensus_correctness_reward_func/mean": 1.0365000367164612,
151
+ "rewards/concensus_correctness_reward_func/std": 0.4453195035457611,
152
+ "rewards/consensus_reward_func/mean": 1.0,
153
+ "rewards/consensus_reward_func/std": 0.0,
154
+ "rewards/cumulative_reward_2/mean": 0.0,
155
+ "rewards/cumulative_reward_2/std": 0.0,
156
+ "rewards/final_correctness_reward_func/mean": 0.0,
157
+ "rewards/final_correctness_reward_func/std": 0.0,
158
+ "rewards/question_recreation_reward_func/mean": 0.10509166494011879,
159
+ "rewards/question_recreation_reward_func/std": 0.013517891056835651,
160
+ "rewards/soft_format_reward_func/mean": 0.0,
161
+ "rewards/soft_format_reward_func/std": 0.0,
162
+ "rewards/strict_format_reward_func/mean": 0.0,
163
+ "rewards/strict_format_reward_func/std": 0.0,
164
+ "rewards/xmlcount_reward_func/mean": 0.0,
165
+ "rewards/xmlcount_reward_func/std": 0.0,
166
+ "step": 8
167
+ },
168
+ {
169
+ "clip_ratio/high_max": 0.0,
170
+ "clip_ratio/high_mean": 0.0,
171
+ "clip_ratio/low_mean": 0.0,
172
+ "clip_ratio/low_min": 0.0,
173
+ "clip_ratio/region_mean": 0.0,
174
+ "completions/clipped_ratio": 0.0,
175
+ "completions/max_length": 8.0,
176
+ "completions/max_terminated_length": 8.0,
177
+ "completions/mean_length": 5.875,
178
+ "completions/mean_terminated_length": 5.875,
179
+ "completions/min_length": 4.0,
180
+ "completions/min_terminated_length": 4.0,
181
+ "epoch": 3.3333333333333335,
182
+ "grad_norm": 5.078624631948969e-09,
183
+ "kl": 0.15408404730260372,
184
+ "learning_rate": 4.7421818538317203e-07,
185
+ "loss": 0.0039,
186
+ "num_tokens": 8059.0,
187
+ "reward": 1.0263595327269286,
188
+ "reward_std": 0.000949082663282752,
189
+ "rewards/concensus_correctness_reward_func/mean": 0.9620000123977661,
190
+ "rewards/concensus_correctness_reward_func/std": 0.0,
191
+ "rewards/consensus_reward_func/mean": 0.0,
192
+ "rewards/consensus_reward_func/std": 0.0,
193
+ "rewards/cumulative_reward_2/mean": 0.0,
194
+ "rewards/cumulative_reward_2/std": 0.0,
195
+ "rewards/final_correctness_reward_func/mean": 0.0,
196
+ "rewards/final_correctness_reward_func/std": 0.0,
197
+ "rewards/question_recreation_reward_func/mean": 0.06435950938612223,
198
+ "rewards/question_recreation_reward_func/std": 0.0011813289020210505,
199
+ "rewards/soft_format_reward_func/mean": 0.0,
200
+ "rewards/soft_format_reward_func/std": 0.0,
201
+ "rewards/strict_format_reward_func/mean": 0.0,
202
+ "rewards/strict_format_reward_func/std": 0.0,
203
+ "rewards/xmlcount_reward_func/mean": 0.0,
204
+ "rewards/xmlcount_reward_func/std": 0.0,
205
  "step": 10
206
  },
207
  {
208
+ "clip_ratio/high_max": 0.0,
209
+ "clip_ratio/high_mean": 0.0,
210
+ "clip_ratio/low_mean": 0.0,
211
+ "clip_ratio/low_min": 0.0,
212
+ "clip_ratio/region_mean": 0.0,
213
+ "completions/clipped_ratio": 0.0,
214
+ "completions/max_length": 21.5,
215
+ "completions/max_terminated_length": 21.5,
216
+ "completions/mean_length": 8.5625,
217
+ "completions/mean_terminated_length": 8.5625,
218
+ "completions/min_length": 4.0,
219
+ "completions/min_terminated_length": 4.0,
220
+ "epoch": 4.0,
221
+ "grad_norm": 0.0029723152983933687,
222
+ "kl": 0.23341824859380722,
223
+ "learning_rate": 4.578674030756363e-07,
224
+ "loss": -0.0579,
225
+ "num_tokens": 9718.0,
226
+ "reward": 0.8252299129962921,
227
+ "reward_std": 0.006629505660384893,
228
+ "rewards/concensus_correctness_reward_func/mean": 0.7135000079870224,
229
+ "rewards/concensus_correctness_reward_func/std": 0.657712772488594,
230
+ "rewards/consensus_reward_func/mean": 0.0,
231
+ "rewards/consensus_reward_func/std": 0.0,
232
+ "rewards/cumulative_reward_2/mean": 0.0,
233
+ "rewards/cumulative_reward_2/std": 0.0,
234
+ "rewards/final_correctness_reward_func/mean": 0.0,
235
+ "rewards/final_correctness_reward_func/std": 0.0,
236
+ "rewards/question_recreation_reward_func/mean": 0.1117299422621727,
237
+ "rewards/question_recreation_reward_func/std": 0.049328483641147614,
238
+ "rewards/soft_format_reward_func/mean": 0.0,
239
+ "rewards/soft_format_reward_func/std": 0.0,
240
+ "rewards/strict_format_reward_func/mean": 0.0,
241
+ "rewards/strict_format_reward_func/std": 0.0,
242
+ "rewards/xmlcount_reward_func/mean": 0.0,
243
+ "rewards/xmlcount_reward_func/std": 0.0,
244
+ "step": 12
245
  },
246
  {
247
+ "clip_ratio/high_max": 0.0,
248
+ "clip_ratio/high_mean": 0.0,
249
+ "clip_ratio/low_mean": 0.0,
250
+ "clip_ratio/low_min": 0.0,
251
+ "clip_ratio/region_mean": 0.0,
252
+ "completions/clipped_ratio": 0.0,
253
+ "completions/max_length": 11.5,
254
+ "completions/max_terminated_length": 11.5,
255
+ "completions/mean_length": 5.875,
256
+ "completions/mean_terminated_length": 5.875,
257
+ "completions/min_length": 4.0,
258
+ "completions/min_terminated_length": 4.0,
259
+ "epoch": 4.666666666666667,
260
+ "grad_norm": 25.810482025146484,
261
+ "kl": 0.13111786916851997,
262
+ "learning_rate": 4.379599518697443e-07,
263
+ "loss": 0.0001,
264
+ "num_tokens": 11348.0,
265
+ "reward": 1.7566919326782227,
266
+ "reward_std": 0.004746258724480867,
267
+ "rewards/concensus_correctness_reward_func/mean": 0.9562499672174454,
268
+ "rewards/concensus_correctness_reward_func/std": 0.5902103334665298,
269
+ "rewards/consensus_reward_func/mean": 0.75,
270
+ "rewards/consensus_reward_func/std": 0.4629100561141968,
271
+ "rewards/cumulative_reward_2/mean": 0.0,
272
+ "rewards/cumulative_reward_2/std": 0.0,
273
+ "rewards/final_correctness_reward_func/mean": 0.0,
274
+ "rewards/final_correctness_reward_func/std": 0.0,
275
+ "rewards/question_recreation_reward_func/mean": 0.05044195894151926,
276
+ "rewards/question_recreation_reward_func/std": 0.01659557862149086,
277
+ "rewards/soft_format_reward_func/mean": 0.0,
278
+ "rewards/soft_format_reward_func/std": 0.0,
279
+ "rewards/strict_format_reward_func/mean": 0.0,
280
+ "rewards/strict_format_reward_func/std": 0.0,
281
+ "rewards/xmlcount_reward_func/mean": 0.0,
282
+ "rewards/xmlcount_reward_func/std": 0.0,
283
+ "step": 14
284
  },
285
  {
286
+ "clip_ratio/high_max": 0.0,
287
+ "clip_ratio/high_mean": 0.0,
288
+ "clip_ratio/low_mean": 0.0,
289
+ "clip_ratio/low_min": 0.0,
290
+ "clip_ratio/region_mean": 0.0,
291
+ "completions/clipped_ratio": 0.0,
292
+ "completions/max_length": 7.5,
293
+ "completions/max_terminated_length": 7.5,
294
+ "completions/mean_length": 4.875,
295
+ "completions/mean_terminated_length": 4.875,
296
+ "completions/min_length": 4.0,
297
+ "completions/min_terminated_length": 4.0,
298
+ "epoch": 5.333333333333333,
299
+ "grad_norm": 12.974661827087402,
300
+ "kl": 0.16539602912962437,
301
+ "learning_rate": 4.1483645377501717e-07,
302
+ "loss": 0.0002,
303
+ "num_tokens": 12976.0,
304
+ "reward": 1.4967188835144043,
305
+ "reward_std": 0.00440641213208437,
306
+ "rewards/concensus_correctness_reward_func/mean": 1.443000078201294,
307
+ "rewards/concensus_correctness_reward_func/std": 0.8906390070915222,
308
+ "rewards/consensus_reward_func/mean": 0.0,
309
+ "rewards/consensus_reward_func/std": 0.0,
310
+ "rewards/cumulative_reward_2/mean": 0.0,
311
+ "rewards/cumulative_reward_2/std": 0.0,
312
+ "rewards/final_correctness_reward_func/mean": 0.0,
313
+ "rewards/final_correctness_reward_func/std": 0.0,
314
+ "rewards/question_recreation_reward_func/mean": 0.05371885746717453,
315
+ "rewards/question_recreation_reward_func/std": 0.024849189911037683,
316
+ "rewards/soft_format_reward_func/mean": 0.0,
317
+ "rewards/soft_format_reward_func/std": 0.0,
318
+ "rewards/strict_format_reward_func/mean": 0.0,
319
+ "rewards/strict_format_reward_func/std": 0.0,
320
+ "rewards/xmlcount_reward_func/mean": 0.0,
321
+ "rewards/xmlcount_reward_func/std": 0.0,
322
+ "step": 16
323
  },
324
  {
325
+ "clip_ratio/high_max": 0.0,
326
+ "clip_ratio/high_mean": 0.0,
327
+ "clip_ratio/low_mean": 0.0,
328
+ "clip_ratio/low_min": 0.0,
329
+ "clip_ratio/region_mean": 0.0,
330
+ "completions/clipped_ratio": 0.0,
331
+ "completions/max_length": 29.5,
332
+ "completions/max_terminated_length": 29.5,
333
+ "completions/mean_length": 7.625,
334
+ "completions/mean_terminated_length": 7.625,
335
+ "completions/min_length": 4.0,
336
+ "completions/min_terminated_length": 4.0,
337
+ "epoch": 6.0,
338
+ "grad_norm": 21.782949447631836,
339
+ "kl": 0.08358937688171864,
340
+ "learning_rate": 3.8889255825490053e-07,
341
+ "loss": 0.0311,
342
+ "num_tokens": 14657.0,
343
+ "reward": 1.5110957026481628,
344
+ "reward_std": 0.00673344565439038,
345
+ "rewards/concensus_correctness_reward_func/mean": 1.4415000081062317,
346
+ "rewards/concensus_correctness_reward_func/std": 0.8897131681442261,
347
+ "rewards/consensus_reward_func/mean": 0.0,
348
+ "rewards/consensus_reward_func/std": 0.0,
349
+ "rewards/cumulative_reward_2/mean": 0.0,
350
+ "rewards/cumulative_reward_2/std": 0.0,
351
+ "rewards/final_correctness_reward_func/mean": 0.0,
352
+ "rewards/final_correctness_reward_func/std": 0.0,
353
+ "rewards/question_recreation_reward_func/mean": 0.06959578255191445,
354
+ "rewards/question_recreation_reward_func/std": 0.03374439827166498,
355
+ "rewards/soft_format_reward_func/mean": 0.0,
356
+ "rewards/soft_format_reward_func/std": 0.0,
357
+ "rewards/strict_format_reward_func/mean": 0.0,
358
+ "rewards/strict_format_reward_func/std": 0.0,
359
+ "rewards/xmlcount_reward_func/mean": 0.0,
360
+ "rewards/xmlcount_reward_func/std": 0.0,
361
+ "step": 18
362
+ },
363
+ {
364
+ "clip_ratio/high_max": 0.0,
365
+ "clip_ratio/high_mean": 0.0,
366
+ "clip_ratio/low_mean": 0.0,
367
+ "clip_ratio/low_min": 0.0,
368
+ "clip_ratio/region_mean": 0.0,
369
+ "completions/clipped_ratio": 0.0,
370
+ "completions/max_length": 7.5,
371
+ "completions/max_terminated_length": 7.5,
372
+ "completions/mean_length": 4.875,
373
+ "completions/mean_terminated_length": 4.875,
374
+ "completions/min_length": 4.0,
375
+ "completions/min_terminated_length": 4.0,
376
+ "epoch": 6.666666666666667,
377
+ "grad_norm": 3.4635640133728884e-08,
378
+ "kl": 0.08152611553668976,
379
+ "learning_rate": 3.605721725547503e-07,
380
+ "loss": 0.0001,
381
+ "num_tokens": 16271.0,
382
+ "reward": 1.7468912601470947,
383
+ "reward_std": 0.0,
384
+ "rewards/concensus_correctness_reward_func/mean": 1.6819999814033508,
385
+ "rewards/concensus_correctness_reward_func/std": 0.44439366459846497,
386
+ "rewards/consensus_reward_func/mean": 0.0,
387
+ "rewards/consensus_reward_func/std": 0.0,
388
+ "rewards/cumulative_reward_2/mean": 0.0,
389
+ "rewards/cumulative_reward_2/std": 0.0,
390
+ "rewards/final_correctness_reward_func/mean": 0.0,
391
+ "rewards/final_correctness_reward_func/std": 0.0,
392
+ "rewards/question_recreation_reward_func/mean": 0.06489124614745378,
393
+ "rewards/question_recreation_reward_func/std": 0.00020469618903007358,
394
+ "rewards/soft_format_reward_func/mean": 0.0,
395
+ "rewards/soft_format_reward_func/std": 0.0,
396
+ "rewards/strict_format_reward_func/mean": 0.0,
397
+ "rewards/strict_format_reward_func/std": 0.0,
398
+ "rewards/xmlcount_reward_func/mean": 0.0,
399
+ "rewards/xmlcount_reward_func/std": 0.0,
400
+ "step": 20
401
  },
402
  {
403
+ "clip_ratio/high_max": 0.0,
404
+ "clip_ratio/high_mean": 0.0,
405
+ "clip_ratio/low_mean": 0.0,
406
+ "clip_ratio/low_min": 0.0,
407
+ "clip_ratio/region_mean": 0.0,
408
+ "completions/clipped_ratio": 0.0,
409
+ "completions/max_length": 16.0,
410
+ "completions/max_terminated_length": 16.0,
411
+ "completions/mean_length": 10.0,
412
+ "completions/mean_terminated_length": 10.0,
413
+ "completions/min_length": 7.5,
414
+ "completions/min_terminated_length": 7.5,
415
+ "epoch": 7.333333333333333,
416
+ "grad_norm": 4.029274336403432e-08,
417
+ "kl": 0.1582445427775383,
418
+ "learning_rate": 3.3035986632579036e-07,
419
+ "loss": -0.0291,
420
+ "num_tokens": 17919.0,
421
+ "reward": 1.0044002416543663,
422
+ "reward_std": 0.0018568980740383267,
423
+ "rewards/concensus_correctness_reward_func/mean": 0.9620000123977661,
424
+ "rewards/concensus_correctness_reward_func/std": 0.0,
425
+ "rewards/consensus_reward_func/mean": 0.0,
426
+ "rewards/consensus_reward_func/std": 0.0,
427
+ "rewards/cumulative_reward_2/mean": 0.0,
428
+ "rewards/cumulative_reward_2/std": 0.0,
429
+ "rewards/final_correctness_reward_func/mean": 0.0,
430
+ "rewards/final_correctness_reward_func/std": 0.0,
431
+ "rewards/question_recreation_reward_func/mean": 0.04240023344755173,
432
+ "rewards/question_recreation_reward_func/std": 0.003086475422605872,
433
+ "rewards/soft_format_reward_func/mean": 0.0,
434
+ "rewards/soft_format_reward_func/std": 0.0,
435
+ "rewards/strict_format_reward_func/mean": 0.0,
436
+ "rewards/strict_format_reward_func/std": 0.0,
437
+ "rewards/xmlcount_reward_func/mean": 0.0,
438
+ "rewards/xmlcount_reward_func/std": 0.0,
439
+ "step": 22
440
  },
441
  {
442
+ "clip_ratio/high_max": 0.0,
443
+ "clip_ratio/high_mean": 0.0,
444
+ "clip_ratio/low_mean": 0.0,
445
+ "clip_ratio/low_min": 0.0,
446
+ "clip_ratio/region_mean": 0.0,
447
+ "completions/clipped_ratio": 0.0625,
448
+ "completions/max_length": 62.0,
449
+ "completions/max_terminated_length": 36.5,
450
+ "completions/mean_length": 18.0625,
451
+ "completions/mean_terminated_length": 12.678571224212646,
452
+ "completions/min_length": 7.5,
453
+ "completions/min_terminated_length": 7.5,
454
+ "epoch": 8.0,
455
+ "grad_norm": 28.683290481567383,
456
+ "kl": 0.24511529877781868,
457
+ "learning_rate": 2.987725805040321e-07,
458
+ "loss": -0.0032,
459
+ "num_tokens": 19698.0,
460
+ "reward": 1.0137799084186554,
461
+ "reward_std": 0.016448209527879953,
462
+ "rewards/concensus_correctness_reward_func/mean": 0.8765000402927399,
463
+ "rewards/concensus_correctness_reward_func/std": 0.6110214740037918,
464
+ "rewards/consensus_reward_func/mean": 0.0,
465
+ "rewards/consensus_reward_func/std": 0.0,
466
+ "rewards/cumulative_reward_2/mean": 0.0,
467
+ "rewards/cumulative_reward_2/std": 0.0,
468
+ "rewards/final_correctness_reward_func/mean": 0.0,
469
+ "rewards/final_correctness_reward_func/std": 0.0,
470
+ "rewards/question_recreation_reward_func/mean": 0.13727985322475433,
471
+ "rewards/question_recreation_reward_func/std": 0.06551616452634335,
472
+ "rewards/soft_format_reward_func/mean": 0.0,
473
+ "rewards/soft_format_reward_func/std": 0.0,
474
+ "rewards/strict_format_reward_func/mean": 0.0,
475
+ "rewards/strict_format_reward_func/std": 0.0,
476
+ "rewards/xmlcount_reward_func/mean": 0.0,
477
+ "rewards/xmlcount_reward_func/std": 0.0,
478
+ "step": 24
479
  },
480
  {
481
+ "clip_ratio/high_max": 0.0,
482
+ "clip_ratio/high_mean": 0.0,
483
+ "clip_ratio/low_mean": 0.0,
484
+ "clip_ratio/low_min": 0.0,
485
+ "clip_ratio/region_mean": 0.0,
486
+ "completions/clipped_ratio": 0.0625,
487
+ "completions/max_length": 74.0,
488
+ "completions/max_terminated_length": 55.5,
489
+ "completions/mean_length": 19.125,
490
+ "completions/mean_terminated_length": 13.982142925262451,
491
+ "completions/min_length": 4.0,
492
+ "completions/min_terminated_length": 4.0,
493
+ "epoch": 8.666666666666666,
494
+ "grad_norm": 9.308053016662598,
495
+ "kl": 0.2221585288643837,
496
+ "learning_rate": 2.663507823075358e-07,
497
+ "loss": 0.0293,
498
+ "num_tokens": 21540.0,
499
+ "reward": 0.7325176950544119,
500
+ "reward_std": 0.00014589350030291826,
501
+ "rewards/concensus_correctness_reward_func/mean": 0.7199999690055847,
502
+ "rewards/concensus_correctness_reward_func/std": 0.44439366459846497,
503
+ "rewards/consensus_reward_func/mean": 0.0,
504
+ "rewards/consensus_reward_func/std": 0.0,
505
+ "rewards/cumulative_reward_2/mean": 0.0,
506
+ "rewards/cumulative_reward_2/std": 0.0,
507
+ "rewards/final_correctness_reward_func/mean": 0.0,
508
+ "rewards/final_correctness_reward_func/std": 0.0,
509
+ "rewards/question_recreation_reward_func/mean": 0.012517708819359541,
510
+ "rewards/question_recreation_reward_func/std": 0.006235835782717913,
511
+ "rewards/soft_format_reward_func/mean": 0.0,
512
+ "rewards/soft_format_reward_func/std": 0.0,
513
+ "rewards/strict_format_reward_func/mean": 0.0,
514
+ "rewards/strict_format_reward_func/std": 0.0,
515
+ "rewards/xmlcount_reward_func/mean": 0.0,
516
+ "rewards/xmlcount_reward_func/std": 0.0,
517
+ "step": 26
518
  },
519
  {
520
+ "clip_ratio/high_max": 0.0,
521
+ "clip_ratio/high_mean": 0.0,
522
+ "clip_ratio/low_mean": 0.0,
523
+ "clip_ratio/low_min": 0.0,
524
+ "clip_ratio/region_mean": 0.0,
525
+ "completions/clipped_ratio": 0.0625,
526
+ "completions/max_length": 50.0,
527
+ "completions/max_terminated_length": 7.0,
528
+ "completions/mean_length": 10.125,
529
+ "completions/mean_terminated_length": 4.428571462631226,
530
+ "completions/min_length": 4.0,
531
+ "completions/min_terminated_length": 4.0,
532
+ "epoch": 9.333333333333334,
533
+ "grad_norm": 14.845952987670898,
534
+ "kl": 0.054857412353158,
535
+ "learning_rate": 2.336492176924642e-07,
536
+ "loss": -0.0711,
537
+ "num_tokens": 23238.0,
538
+ "reward": 0.969230305403471,
539
+ "reward_std": 0.0014799739001318812,
540
+ "rewards/concensus_correctness_reward_func/mean": 0.9620000123977661,
541
+ "rewards/concensus_correctness_reward_func/std": 0.0,
542
+ "rewards/consensus_reward_func/mean": 0.0,
543
+ "rewards/consensus_reward_func/std": 0.0,
544
+ "rewards/cumulative_reward_2/mean": 0.0,
545
+ "rewards/cumulative_reward_2/std": 0.0,
546
+ "rewards/final_correctness_reward_func/mean": 0.0,
547
+ "rewards/final_correctness_reward_func/std": 0.0,
548
+ "rewards/question_recreation_reward_func/mean": 0.0072302743792533875,
549
+ "rewards/question_recreation_reward_func/std": 0.003834317671135068,
550
+ "rewards/soft_format_reward_func/mean": 0.0,
551
+ "rewards/soft_format_reward_func/std": 0.0,
552
+ "rewards/strict_format_reward_func/mean": 0.0,
553
+ "rewards/strict_format_reward_func/std": 0.0,
554
+ "rewards/xmlcount_reward_func/mean": 0.0,
555
+ "rewards/xmlcount_reward_func/std": 0.0,
556
+ "step": 28
557
  },
558
  {
559
+ "clip_ratio/high_max": 0.0,
560
+ "clip_ratio/high_mean": 0.0,
561
+ "clip_ratio/low_mean": 0.0,
562
+ "clip_ratio/low_min": 0.0,
563
+ "clip_ratio/region_mean": 0.0,
564
+ "completions/clipped_ratio": 0.1875,
565
+ "completions/max_length": 96.0,
566
+ "completions/max_terminated_length": 52.5,
567
+ "completions/mean_length": 31.0,
568
+ "completions/mean_terminated_length": 16.547618865966797,
569
+ "completions/min_length": 4.0,
570
+ "completions/min_terminated_length": 4.0,
571
+ "epoch": 10.0,
572
+ "grad_norm": 8.113728523254395,
573
+ "kl": 0.1079877857118845,
574
+ "learning_rate": 2.0122741949596793e-07,
575
+ "loss": 0.0528,
576
+ "num_tokens": 25119.0,
577
+ "reward": 0.7863147500902414,
578
+ "reward_std": 0.007107729441486299,
579
+ "rewards/concensus_correctness_reward_func/mean": 0.721500039100647,
580
+ "rewards/concensus_correctness_reward_func/std": 0.4453195035457611,
581
+ "rewards/consensus_reward_func/mean": 0.0,
582
+ "rewards/consensus_reward_func/std": 0.0,
583
+ "rewards/cumulative_reward_2/mean": 0.0,
584
+ "rewards/cumulative_reward_2/std": 0.0,
585
+ "rewards/final_correctness_reward_func/mean": 0.0,
586
+ "rewards/final_correctness_reward_func/std": 0.0,
587
+ "rewards/question_recreation_reward_func/mean": 0.06481470353901386,
588
+ "rewards/question_recreation_reward_func/std": 0.043396634981036186,
589
+ "rewards/soft_format_reward_func/mean": 0.0,
590
+ "rewards/soft_format_reward_func/std": 0.0,
591
+ "rewards/strict_format_reward_func/mean": 0.0,
592
+ "rewards/strict_format_reward_func/std": 0.0,
593
+ "rewards/xmlcount_reward_func/mean": 0.0,
594
+ "rewards/xmlcount_reward_func/std": 0.0,
595
+ "step": 30
596
  },
597
  {
598
+ "clip_ratio/high_max": 0.0,
599
+ "clip_ratio/high_mean": 0.0,
600
+ "clip_ratio/low_mean": 0.0,
601
+ "clip_ratio/low_min": 0.0,
602
+ "clip_ratio/region_mean": 0.0,
603
+ "completions/clipped_ratio": 0.0,
604
+ "completions/max_length": 36.5,
605
+ "completions/max_terminated_length": 36.5,
606
+ "completions/mean_length": 14.6875,
607
+ "completions/mean_terminated_length": 14.6875,
608
+ "completions/min_length": 4.0,
609
+ "completions/min_terminated_length": 4.0,
610
+ "epoch": 10.666666666666666,
611
+ "grad_norm": 20.759033203125,
612
+ "kl": 0.2695002183318138,
613
+ "learning_rate": 1.6964013367420965e-07,
614
+ "loss": -0.0615,
615
+ "num_tokens": 26890.0,
616
+ "reward": 0.9509141445159912,
617
+ "reward_std": 0.0098513662815094,
618
+ "rewards/concensus_correctness_reward_func/mean": 0.8749999701976776,
619
+ "rewards/concensus_correctness_reward_func/std": 0.6100956350564957,
620
+ "rewards/consensus_reward_func/mean": 0.0,
621
+ "rewards/consensus_reward_func/std": 0.0,
622
+ "rewards/cumulative_reward_2/mean": 0.0,
623
+ "rewards/cumulative_reward_2/std": 0.0,
624
+ "rewards/final_correctness_reward_func/mean": 0.0,
625
+ "rewards/final_correctness_reward_func/std": 0.0,
626
+ "rewards/question_recreation_reward_func/mean": 0.07591419294476509,
627
+ "rewards/question_recreation_reward_func/std": 0.047521001601126045,
628
+ "rewards/soft_format_reward_func/mean": 0.0,
629
+ "rewards/soft_format_reward_func/std": 0.0,
630
+ "rewards/strict_format_reward_func/mean": 0.0,
631
+ "rewards/strict_format_reward_func/std": 0.0,
632
+ "rewards/xmlcount_reward_func/mean": 0.0,
633
+ "rewards/xmlcount_reward_func/std": 0.0,
634
+ "step": 32
635
  },
636
  {
637
+ "clip_ratio/high_max": 0.0,
638
+ "clip_ratio/high_mean": 0.0,
639
+ "clip_ratio/low_mean": 0.0,
640
+ "clip_ratio/low_min": 0.0,
641
+ "clip_ratio/region_mean": 0.0,
642
+ "completions/clipped_ratio": 0.0625,
643
+ "completions/max_length": 50.0,
644
+ "completions/max_terminated_length": 14.0,
645
+ "completions/mean_length": 11.0,
646
+ "completions/mean_terminated_length": 5.428571462631226,
647
+ "completions/min_length": 4.0,
648
+ "completions/min_terminated_length": 4.0,
649
+ "epoch": 11.333333333333334,
650
+ "grad_norm": 10.727737426757812,
651
+ "kl": 0.05535062029957771,
652
+ "learning_rate": 1.3942782744524973e-07,
653
+ "loss": 0.053,
654
+ "num_tokens": 28602.0,
655
+ "reward": 1.749426543712616,
656
+ "reward_std": 0.008744525723159313,
657
+ "rewards/concensus_correctness_reward_func/mean": 1.6775000095367432,
658
+ "rewards/concensus_correctness_reward_func/std": 0.4453195035457611,
659
+ "rewards/consensus_reward_func/mean": 0.0,
660
+ "rewards/consensus_reward_func/std": 0.0,
661
+ "rewards/cumulative_reward_2/mean": 0.0,
662
+ "rewards/cumulative_reward_2/std": 0.0,
663
+ "rewards/final_correctness_reward_func/mean": 0.0,
664
+ "rewards/final_correctness_reward_func/std": 0.0,
665
+ "rewards/question_recreation_reward_func/mean": 0.07192660123109818,
666
+ "rewards/question_recreation_reward_func/std": 0.028842128813266754,
667
+ "rewards/soft_format_reward_func/mean": 0.0,
668
+ "rewards/soft_format_reward_func/std": 0.0,
669
+ "rewards/strict_format_reward_func/mean": 0.0,
670
+ "rewards/strict_format_reward_func/std": 0.0,
671
+ "rewards/xmlcount_reward_func/mean": 0.0,
672
+ "rewards/xmlcount_reward_func/std": 0.0,
673
+ "step": 34
674
  },
675
  {
676
+ "clip_ratio/high_max": 0.0,
677
+ "clip_ratio/high_mean": 0.0,
678
+ "clip_ratio/low_mean": 0.0,
679
+ "clip_ratio/low_min": 0.0,
680
+ "clip_ratio/region_mean": 0.0,
681
+ "completions/clipped_ratio": 0.0,
682
+ "completions/max_length": 39.0,
683
+ "completions/max_terminated_length": 39.0,
684
+ "completions/mean_length": 14.1875,
685
+ "completions/mean_terminated_length": 14.1875,
686
+ "completions/min_length": 4.0,
687
+ "completions/min_terminated_length": 4.0,
688
+ "epoch": 12.0,
689
+ "grad_norm": 4.972465603714227e-07,
690
+ "kl": 0.09537366032600403,
691
+ "learning_rate": 1.1110744174509951e-07,
692
+ "loss": -0.0332,
693
+ "num_tokens": 30365.0,
694
+ "reward": 0.9711364028044045,
695
+ "reward_std": 0.000936258933506906,
696
+ "rewards/concensus_correctness_reward_func/mean": 0.9599999785423279,
697
+ "rewards/concensus_correctness_reward_func/std": 0.0,
698
+ "rewards/consensus_reward_func/mean": 0.0,
699
+ "rewards/consensus_reward_func/std": 0.0,
700
+ "rewards/cumulative_reward_2/mean": 0.0,
701
+ "rewards/cumulative_reward_2/std": 0.0,
702
+ "rewards/final_correctness_reward_func/mean": 0.0,
703
+ "rewards/final_correctness_reward_func/std": 0.0,
704
+ "rewards/question_recreation_reward_func/mean": 0.01113644428551197,
705
+ "rewards/question_recreation_reward_func/std": 0.004477651324123144,
706
+ "rewards/soft_format_reward_func/mean": 0.0,
707
+ "rewards/soft_format_reward_func/std": 0.0,
708
+ "rewards/strict_format_reward_func/mean": 0.0,
709
+ "rewards/strict_format_reward_func/std": 0.0,
710
+ "rewards/xmlcount_reward_func/mean": 0.0,
711
+ "rewards/xmlcount_reward_func/std": 0.0,
712
+ "step": 36
713
  },
714
  {
715
+ "clip_ratio/high_max": 0.0,
716
+ "clip_ratio/high_mean": 0.0,
717
+ "clip_ratio/low_mean": 0.0,
718
+ "clip_ratio/low_min": 0.0,
719
+ "clip_ratio/region_mean": 0.0,
720
+ "completions/clipped_ratio": 0.0,
721
+ "completions/max_length": 26.0,
722
+ "completions/max_terminated_length": 26.0,
723
+ "completions/mean_length": 8.8125,
724
+ "completions/mean_terminated_length": 8.8125,
725
+ "completions/min_length": 4.0,
726
+ "completions/min_terminated_length": 4.0,
727
+ "epoch": 12.666666666666666,
728
+ "grad_norm": 2.598005144349713e-09,
729
+ "kl": 0.07045537792146206,
730
+ "learning_rate": 8.516354622498278e-08,
731
+ "loss": 0.0101,
732
+ "num_tokens": 32042.0,
733
+ "reward": 1.0144791088532656,
734
+ "reward_std": 8.961161074694246e-05,
735
+ "rewards/concensus_correctness_reward_func/mean": 0.9620000123977661,
736
+ "rewards/concensus_correctness_reward_func/std": 0.0,
737
+ "rewards/consensus_reward_func/mean": 0.0,
738
+ "rewards/consensus_reward_func/std": 0.0,
739
+ "rewards/cumulative_reward_2/mean": 0.0,
740
+ "rewards/cumulative_reward_2/std": 0.0,
741
+ "rewards/final_correctness_reward_func/mean": 0.0,
742
+ "rewards/final_correctness_reward_func/std": 0.0,
743
+ "rewards/question_recreation_reward_func/mean": 0.05247914115898311,
744
+ "rewards/question_recreation_reward_func/std": 0.0014408096903935075,
745
+ "rewards/soft_format_reward_func/mean": 0.0,
746
+ "rewards/soft_format_reward_func/std": 0.0,
747
+ "rewards/strict_format_reward_func/mean": 0.0,
748
+ "rewards/strict_format_reward_func/std": 0.0,
749
+ "rewards/xmlcount_reward_func/mean": 0.0,
750
+ "rewards/xmlcount_reward_func/std": 0.0,
751
+ "step": 38
752
  },
753
  {
754
+ "clip_ratio/high_max": 0.0,
755
+ "clip_ratio/high_mean": 0.0,
756
+ "clip_ratio/low_mean": 0.0,
757
+ "clip_ratio/low_min": 0.0,
758
+ "clip_ratio/region_mean": 0.0,
759
+ "completions/clipped_ratio": 0.125,
760
+ "completions/max_length": 84.5,
761
+ "completions/max_terminated_length": 47.0,
762
+ "completions/mean_length": 24.9375,
763
+ "completions/mean_terminated_length": 14.145833492279053,
764
+ "completions/min_length": 4.0,
765
+ "completions/min_terminated_length": 4.0,
766
+ "epoch": 13.333333333333334,
767
+ "grad_norm": 11.580063819885254,
768
+ "kl": 0.26880926452577114,
769
+ "learning_rate": 6.204004813025567e-08,
770
+ "loss": -0.0378,
771
+ "num_tokens": 33920.0,
772
+ "reward": 0.016987387090921402,
773
+ "reward_std": 0.001602659816853702,
774
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
775
+ "rewards/concensus_correctness_reward_func/std": 0.0,
776
+ "rewards/consensus_reward_func/mean": 0.0,
777
+ "rewards/consensus_reward_func/std": 0.0,
778
+ "rewards/cumulative_reward_2/mean": 0.0,
779
+ "rewards/cumulative_reward_2/std": 0.0,
780
+ "rewards/final_correctness_reward_func/mean": 0.0,
781
+ "rewards/final_correctness_reward_func/std": 0.0,
782
+ "rewards/question_recreation_reward_func/mean": 0.016987387090921402,
783
+ "rewards/question_recreation_reward_func/std": 0.012470217421650887,
784
+ "rewards/soft_format_reward_func/mean": 0.0,
785
+ "rewards/soft_format_reward_func/std": 0.0,
786
+ "rewards/strict_format_reward_func/mean": 0.0,
787
+ "rewards/strict_format_reward_func/std": 0.0,
788
+ "rewards/xmlcount_reward_func/mean": 0.0,
789
+ "rewards/xmlcount_reward_func/std": 0.0,
790
+ "step": 40
791
+ },
792
+ {
793
+ "clip_ratio/high_max": 0.0,
794
+ "clip_ratio/high_mean": 0.0,
795
+ "clip_ratio/low_mean": 0.0,
796
+ "clip_ratio/low_min": 0.0,
797
+ "clip_ratio/region_mean": 0.0,
798
+ "completions/clipped_ratio": 0.0,
799
+ "completions/max_length": 16.0,
800
+ "completions/max_terminated_length": 16.0,
801
+ "completions/mean_length": 9.875,
802
+ "completions/mean_terminated_length": 9.875,
803
+ "completions/min_length": 4.0,
804
+ "completions/min_terminated_length": 4.0,
805
+ "epoch": 14.0,
806
+ "grad_norm": 13.555513381958008,
807
+ "kl": 0.13996858708560467,
808
+ "learning_rate": 4.213259692436366e-08,
809
+ "loss": 0.025,
810
+ "num_tokens": 35711.0,
811
+ "reward": 1.238945186138153,
812
+ "reward_std": 0.019770290702581406,
813
+ "rewards/concensus_correctness_reward_func/mean": 1.117000013589859,
814
+ "rewards/concensus_correctness_reward_func/std": 0.1657019704580307,
815
+ "rewards/consensus_reward_func/mean": 0.0,
816
+ "rewards/consensus_reward_func/std": 0.0,
817
+ "rewards/cumulative_reward_2/mean": 0.0,
818
+ "rewards/cumulative_reward_2/std": 0.0,
819
+ "rewards/final_correctness_reward_func/mean": 0.0,
820
+ "rewards/final_correctness_reward_func/std": 0.0,
821
+ "rewards/question_recreation_reward_func/mean": 0.121945109218359,
822
+ "rewards/question_recreation_reward_func/std": 0.049046434462070465,
823
+ "rewards/soft_format_reward_func/mean": 0.0,
824
+ "rewards/soft_format_reward_func/std": 0.0,
825
+ "rewards/strict_format_reward_func/mean": 0.0,
826
+ "rewards/strict_format_reward_func/std": 0.0,
827
+ "rewards/xmlcount_reward_func/mean": 0.0,
828
+ "rewards/xmlcount_reward_func/std": 0.0,
829
+ "step": 42
830
  },
831
  {
832
+ "clip_ratio/high_max": 0.0,
833
+ "clip_ratio/high_mean": 0.0,
834
+ "clip_ratio/low_mean": 0.0,
835
+ "clip_ratio/low_min": 0.0,
836
+ "clip_ratio/region_mean": 0.0,
837
+ "completions/clipped_ratio": 0.0625,
838
+ "completions/max_length": 50.0,
839
+ "completions/max_terminated_length": 33.0,
840
+ "completions/mean_length": 13.375,
841
+ "completions/mean_terminated_length": 8.142857074737549,
842
+ "completions/min_length": 4.0,
843
+ "completions/min_terminated_length": 4.0,
844
+ "epoch": 14.666666666666666,
845
+ "grad_norm": 6.8140435516284015e-09,
846
+ "kl": 0.0543803870677948,
847
+ "learning_rate": 2.5781814616827933e-08,
848
+ "loss": -0.0189,
849
+ "num_tokens": 37461.0,
850
+ "reward": 1.7509318590164185,
851
+ "reward_std": 0.0020830321591347456,
852
+ "rewards/concensus_correctness_reward_func/mean": 1.683500051498413,
853
+ "rewards/concensus_correctness_reward_func/std": 0.4453195035457611,
854
+ "rewards/consensus_reward_func/mean": 0.0,
855
+ "rewards/consensus_reward_func/std": 0.0,
856
+ "rewards/cumulative_reward_2/mean": 0.0,
857
+ "rewards/cumulative_reward_2/std": 0.0,
858
+ "rewards/final_correctness_reward_func/mean": 0.0,
859
+ "rewards/final_correctness_reward_func/std": 0.0,
860
+ "rewards/question_recreation_reward_func/mean": 0.06743180518969893,
861
+ "rewards/question_recreation_reward_func/std": 0.004411348141729832,
862
+ "rewards/soft_format_reward_func/mean": 0.0,
863
+ "rewards/soft_format_reward_func/std": 0.0,
864
+ "rewards/strict_format_reward_func/mean": 0.0,
865
+ "rewards/strict_format_reward_func/std": 0.0,
866
+ "rewards/xmlcount_reward_func/mean": 0.0,
867
+ "rewards/xmlcount_reward_func/std": 0.0,
868
+ "step": 44
869
  },
870
  {
871
+ "clip_ratio/high_max": 0.0,
872
+ "clip_ratio/high_mean": 0.0,
873
+ "clip_ratio/low_mean": 0.0,
874
+ "clip_ratio/low_min": 0.0,
875
+ "clip_ratio/region_mean": 0.0,
876
+ "completions/clipped_ratio": 0.125,
877
+ "completions/max_length": 93.0,
878
+ "completions/max_terminated_length": 72.5,
879
+ "completions/mean_length": 32.5625,
880
+ "completions/mean_terminated_length": 22.583333015441895,
881
+ "completions/min_length": 4.0,
882
+ "completions/min_terminated_length": 4.0,
883
+ "epoch": 15.333333333333334,
884
+ "grad_norm": 14.194186210632324,
885
+ "kl": 0.33030909672379494,
886
+ "learning_rate": 1.3267467626223605e-08,
887
+ "loss": 0.0271,
888
+ "num_tokens": 39502.0,
889
+ "reward": 0.019012368749827147,
890
+ "reward_std": 0.0037862203316763043,
891
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
892
+ "rewards/concensus_correctness_reward_func/std": 0.0,
893
+ "rewards/consensus_reward_func/mean": 0.0,
894
+ "rewards/consensus_reward_func/std": 0.0,
895
+ "rewards/cumulative_reward_2/mean": 0.0,
896
+ "rewards/cumulative_reward_2/std": 0.0,
897
+ "rewards/final_correctness_reward_func/mean": 0.0,
898
+ "rewards/final_correctness_reward_func/std": 0.0,
899
+ "rewards/question_recreation_reward_func/mean": 0.019012369215488434,
900
+ "rewards/question_recreation_reward_func/std": 0.014280529227107763,
901
+ "rewards/soft_format_reward_func/mean": 0.0,
902
+ "rewards/soft_format_reward_func/std": 0.0,
903
+ "rewards/strict_format_reward_func/mean": 0.0,
904
+ "rewards/strict_format_reward_func/std": 0.0,
905
+ "rewards/xmlcount_reward_func/mean": 0.0,
906
+ "rewards/xmlcount_reward_func/std": 0.0,
907
+ "step": 46
908
  },
909
  {
910
+ "clip_ratio/high_max": 0.0,
911
+ "clip_ratio/high_mean": 0.0,
912
+ "clip_ratio/low_mean": 0.0,
913
+ "clip_ratio/low_min": 0.0,
914
+ "clip_ratio/region_mean": 0.0,
915
+ "completions/clipped_ratio": 0.25,
916
+ "completions/max_length": 50.0,
917
+ "completions/max_terminated_length": 4.0,
918
+ "completions/mean_length": 27.0,
919
+ "completions/mean_terminated_length": 4.0,
920
+ "completions/min_length": 4.0,
921
+ "completions/min_terminated_length": 4.0,
922
+ "epoch": 16.0,
923
+ "grad_norm": 6.133322715759277,
924
+ "kl": 0.044046737253665924,
925
+ "learning_rate": 4.803679899192392e-09,
926
+ "loss": 0.0,
927
+ "num_tokens": 41286.0,
928
+ "reward": 0.9705088818445802,
929
+ "reward_std": 0.0009944201447069645,
930
+ "rewards/concensus_correctness_reward_func/mean": 0.9599999785423279,
931
+ "rewards/concensus_correctness_reward_func/std": 0.0,
932
+ "rewards/consensus_reward_func/mean": 0.0,
933
+ "rewards/consensus_reward_func/std": 0.0,
934
+ "rewards/cumulative_reward_2/mean": 0.0,
935
+ "rewards/cumulative_reward_2/std": 0.0,
936
+ "rewards/final_correctness_reward_func/mean": 0.0,
937
+ "rewards/final_correctness_reward_func/std": 0.0,
938
+ "rewards/question_recreation_reward_func/mean": 0.010508923325687647,
939
+ "rewards/question_recreation_reward_func/std": 0.004894755315035582,
940
+ "rewards/soft_format_reward_func/mean": 0.0,
941
+ "rewards/soft_format_reward_func/std": 0.0,
942
+ "rewards/strict_format_reward_func/mean": 0.0,
943
+ "rewards/strict_format_reward_func/std": 0.0,
944
+ "rewards/xmlcount_reward_func/mean": 0.0,
945
+ "rewards/xmlcount_reward_func/std": 0.0,
946
+ "step": 48
947
  },
948
  {
949
+ "clip_ratio/high_max": 0.0,
950
+ "clip_ratio/high_mean": 0.0,
951
+ "clip_ratio/low_mean": 0.0,
952
+ "clip_ratio/low_min": 0.0,
953
+ "clip_ratio/region_mean": 0.0,
954
+ "completions/clipped_ratio": 0.1875,
955
+ "completions/max_length": 96.0,
956
+ "completions/max_terminated_length": 43.0,
957
+ "completions/mean_length": 28.6875,
958
+ "completions/mean_terminated_length": 13.523809909820557,
959
+ "completions/min_length": 4.0,
960
+ "completions/min_terminated_length": 4.0,
961
+ "epoch": 16.666666666666668,
962
+ "grad_norm": 7.58546257019043,
963
+ "kl": 0.1461558025330305,
964
+ "learning_rate": 5.352691903491303e-10,
965
+ "loss": -0.0349,
966
+ "num_tokens": 43281.0,
967
+ "reward": 0.4925380670465529,
968
+ "reward_std": 0.0025778122944757342,
969
+ "rewards/concensus_correctness_reward_func/mean": 0.48100000619888306,
970
+ "rewards/concensus_correctness_reward_func/std": 0.5142106413841248,
971
+ "rewards/consensus_reward_func/mean": 0.0,
972
+ "rewards/consensus_reward_func/std": 0.0,
973
+ "rewards/cumulative_reward_2/mean": 0.0,
974
+ "rewards/cumulative_reward_2/std": 0.0,
975
+ "rewards/final_correctness_reward_func/mean": 0.0,
976
+ "rewards/final_correctness_reward_func/std": 0.0,
977
+ "rewards/question_recreation_reward_func/mean": 0.011538045946508646,
978
+ "rewards/question_recreation_reward_func/std": 0.008576579857617617,
979
+ "rewards/soft_format_reward_func/mean": 0.0,
980
+ "rewards/soft_format_reward_func/std": 0.0,
981
+ "rewards/strict_format_reward_func/mean": 0.0,
982
+ "rewards/strict_format_reward_func/std": 0.0,
983
+ "rewards/xmlcount_reward_func/mean": 0.0,
984
+ "rewards/xmlcount_reward_func/std": 0.0,
985
+ "step": 50
986
  },
987
  {
988
+ "epoch": 16.666666666666668,
989
+ "step": 50,
990
  "total_flos": 0.0,
991
+ "train_loss": -0.006354737589201066,
992
+ "train_runtime": 2680.4196,
993
+ "train_samples_per_second": 0.149,
994
+ "train_steps_per_second": 0.019
995
  }
996
  ],
997
+ "logging_steps": 2,
998
+ "max_steps": 50,
999
+ "num_input_tokens_seen": 43281,
1000
+ "num_train_epochs": 17,
1001
+ "save_steps": 25,
1002
  "stateful_callbacks": {
1003
  "TrainerControl": {
1004
  "args": {
 
1012
  }
1013
  },
1014
  "total_flos": 0.0,
1015
+ "train_batch_size": 2,
1016
  "trial_name": null,
1017
  "trial_params": null
1018
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f61e7cd63625c52f24373a35fc8911f78375028dddc9c9ee533d76f160cb57ec
3
- size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aab747cd8440c28c67e9fdfa8df6ca5ce16d51dfbea95f302c5d05f586371c45
3
+ size 6801