Blancy committed on
Commit
1dcb3d0
·
verified ·
1 Parent(s): 6ab119d

Model save

Browse files
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-0.6B
3
+ library_name: transformers
4
+ model_name: Qwen3-0.6B-Open-R1-GRPO
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - grpo
9
+ license: license
10
+ ---
11
+
12
+ # Model Card for Qwen3-0.6B-Open-R1-GRPO
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="Blancy/Qwen3-0.6B-Open-R1-GRPO", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+
31
+
32
+
33
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.15.2
38
+ - Transformers: 4.52.3
39
+ - PyTorch: 2.5.1
40
+ - Datasets: 3.6.0
41
+ - Tokenizers: 0.21.1
42
+
43
+ ## Citations
44
+
45
+ Cite GRPO as:
46
+
47
+ ```bibtex
48
+ @article{zhihong2024deepseekmath,
49
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
50
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
51
+ year = 2024,
52
+ eprint = {arXiv:2402.03300},
53
+ }
54
+
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @misc{vonwerra2022trl,
61
+ title = {{TRL: Transformer Reinforcement Learning}},
62
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
63
+ year = 2020,
64
+ journal = {GitHub repository},
65
+ publisher = {GitHub},
66
+ howpublished = {\url{https://github.com/huggingface/trl}}
67
+ }
68
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.014487237919349916,
4
+ "train_runtime": 13938.4252,
5
+ "train_samples": 1000,
6
+ "train_samples_per_second": 0.072,
7
+ "train_steps_per_second": 0.003
8
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.6,
10
+ "top_k": 20,
11
+ "top_p": 0.95,
12
+ "transformers_version": "4.52.3"
13
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.014487237919349916,
4
+ "train_runtime": 13938.4252,
5
+ "train_samples": 1000,
6
+ "train_samples_per_second": 0.072,
7
+ "train_steps_per_second": 0.003
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 36,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "completion_length": 3737.096435546875,
14
+ "epoch": 0.027972027972027972,
15
+ "grad_norm": 0.4781525433063507,
16
+ "kl": 0.0,
17
+ "learning_rate": 0.0,
18
+ "loss": 0.0,
19
+ "reward": 0.49642856419086456,
20
+ "reward_std": 0.14968311414122581,
21
+ "rewards/accuracy_reward": 0.09285714384168386,
22
+ "rewards/format_reward": 0.0,
23
+ "rewards/tag_count_reward": 0.4035714343190193,
24
+ "step": 1
25
+ },
26
+ {
27
+ "completion_length": 3919.4143676757812,
28
+ "epoch": 0.055944055944055944,
29
+ "grad_norm": 0.8725160360336304,
30
+ "kl": 0.0,
31
+ "learning_rate": 2.5e-06,
32
+ "loss": 0.0,
33
+ "reward": 0.41160714626312256,
34
+ "reward_std": 0.1353183900937438,
35
+ "rewards/accuracy_reward": 0.05000000260770321,
36
+ "rewards/format_reward": 0.0,
37
+ "rewards/tag_count_reward": 0.3616071417927742,
38
+ "step": 2
39
+ },
40
+ {
41
+ "completion_length": 3489.6964721679688,
42
+ "epoch": 0.08391608391608392,
43
+ "grad_norm": 0.5938112735748291,
44
+ "kl": 0.000396728515625,
45
+ "learning_rate": 5e-06,
46
+ "loss": 0.0,
47
+ "reward": 0.5473214313387871,
48
+ "reward_std": 0.15705320611596107,
49
+ "rewards/accuracy_reward": 0.11785714374855161,
50
+ "rewards/format_reward": 0.0,
51
+ "rewards/tag_count_reward": 0.42946429550647736,
52
+ "step": 3
53
+ },
54
+ {
55
+ "completion_length": 3473.7571411132812,
56
+ "epoch": 0.11188811188811189,
57
+ "grad_norm": 0.5854588747024536,
58
+ "kl": 0.0013608932495117188,
59
+ "learning_rate": 7.500000000000001e-06,
60
+ "loss": 0.0001,
61
+ "reward": 0.5562500059604645,
62
+ "reward_std": 0.17247037403285503,
63
+ "rewards/accuracy_reward": 0.14285714388825,
64
+ "rewards/format_reward": 0.0,
65
+ "rewards/tag_count_reward": 0.4133928641676903,
66
+ "step": 4
67
+ },
68
+ {
69
+ "completion_length": 3737.9000244140625,
70
+ "epoch": 0.13986013986013987,
71
+ "grad_norm": 8.259861946105957,
72
+ "kl": 0.006000518798828125,
73
+ "learning_rate": 1e-05,
74
+ "loss": 0.0002,
75
+ "reward": 0.5544642880558968,
76
+ "reward_std": 0.19513687305152416,
77
+ "rewards/accuracy_reward": 0.157142860814929,
78
+ "rewards/format_reward": 0.0,
79
+ "rewards/tag_count_reward": 0.3973214253783226,
80
+ "step": 5
81
+ },
82
+ {
83
+ "completion_length": 3746.992919921875,
84
+ "epoch": 0.16783216783216784,
85
+ "grad_norm": 0.48524144291877747,
86
+ "kl": 0.0187225341796875,
87
+ "learning_rate": 9.978331270024887e-06,
88
+ "loss": 0.0007,
89
+ "reward": 0.5526785850524902,
90
+ "reward_std": 0.27357756346464157,
91
+ "rewards/accuracy_reward": 0.12142857350409031,
92
+ "rewards/format_reward": 0.0,
93
+ "rewards/tag_count_reward": 0.4312500059604645,
94
+ "step": 6
95
+ },
96
+ {
97
+ "completion_length": 3918.8643188476562,
98
+ "epoch": 0.1958041958041958,
99
+ "grad_norm": 0.29106539487838745,
100
+ "kl": 0.025238037109375,
101
+ "learning_rate": 9.913533761814537e-06,
102
+ "loss": 0.001,
103
+ "reward": 0.5473214462399483,
104
+ "reward_std": 0.2193572111427784,
105
+ "rewards/accuracy_reward": 0.12857143231667578,
106
+ "rewards/format_reward": 0.0,
107
+ "rewards/tag_count_reward": 0.41875000298023224,
108
+ "step": 7
109
+ },
110
+ {
111
+ "completion_length": 3901.175048828125,
112
+ "epoch": 0.22377622377622378,
113
+ "grad_norm": 0.34263718128204346,
114
+ "kl": 0.0513916015625,
115
+ "learning_rate": 9.80623151079494e-06,
116
+ "loss": 0.0021,
117
+ "reward": 0.4044642969965935,
118
+ "reward_std": 0.12892552372068167,
119
+ "rewards/accuracy_reward": 0.04642857238650322,
120
+ "rewards/format_reward": 0.0,
121
+ "rewards/tag_count_reward": 0.35803572088479996,
122
+ "step": 8
123
+ },
124
+ {
125
+ "completion_length": 3889.1786499023438,
126
+ "epoch": 0.2517482517482518,
127
+ "grad_norm": 1.2209001779556274,
128
+ "kl": 0.0618896484375,
129
+ "learning_rate": 9.65745789630079e-06,
130
+ "loss": 0.0025,
131
+ "reward": 0.48303572088479996,
132
+ "reward_std": 0.21392671391367912,
133
+ "rewards/accuracy_reward": 0.11428571795113385,
134
+ "rewards/format_reward": 0.0,
135
+ "rewards/tag_count_reward": 0.3687500059604645,
136
+ "step": 9
137
+ },
138
+ {
139
+ "completion_length": 4003.5607299804688,
140
+ "epoch": 0.27972027972027974,
141
+ "grad_norm": 0.3369362950325012,
142
+ "kl": 0.068359375,
143
+ "learning_rate": 9.468645689567599e-06,
144
+ "loss": 0.0027,
145
+ "reward": 0.46339286118745804,
146
+ "reward_std": 0.22164541110396385,
147
+ "rewards/accuracy_reward": 0.09285714570432901,
148
+ "rewards/format_reward": 0.0,
149
+ "rewards/tag_count_reward": 0.3705357164144516,
150
+ "step": 10
151
+ },
152
+ {
153
+ "completion_length": 3949.378662109375,
154
+ "epoch": 0.3076923076923077,
155
+ "grad_norm": 0.7158792018890381,
156
+ "kl": 0.0810546875,
157
+ "learning_rate": 9.241613255361455e-06,
158
+ "loss": 0.0032,
159
+ "reward": 0.4375000074505806,
160
+ "reward_std": 0.1808728687465191,
161
+ "rewards/accuracy_reward": 0.08214285899884999,
162
+ "rewards/format_reward": 0.0,
163
+ "rewards/tag_count_reward": 0.3553571403026581,
164
+ "step": 11
165
+ },
166
+ {
167
+ "completion_length": 4065.3965454101562,
168
+ "epoch": 0.3356643356643357,
169
+ "grad_norm": 0.26943907141685486,
170
+ "kl": 0.0992431640625,
171
+ "learning_rate": 8.978547040132317e-06,
172
+ "loss": 0.004,
173
+ "reward": 0.35625000298023224,
174
+ "reward_std": 0.11728819366544485,
175
+ "rewards/accuracy_reward": 0.0321428575553,
176
+ "rewards/format_reward": 0.0,
177
+ "rewards/tag_count_reward": 0.3241071403026581,
178
+ "step": 12
179
+ },
180
+ {
181
+ "completion_length": 3880.08203125,
182
+ "epoch": 0.36363636363636365,
183
+ "grad_norm": 0.3461170792579651,
184
+ "kl": 0.1202392578125,
185
+ "learning_rate": 8.681980515339464e-06,
186
+ "loss": 0.0048,
187
+ "reward": 0.6062500029802322,
188
+ "reward_std": 0.22258323803544044,
189
+ "rewards/accuracy_reward": 0.2607142962515354,
190
+ "rewards/format_reward": 0.0,
191
+ "rewards/tag_count_reward": 0.3455357179045677,
192
+ "step": 13
193
+ },
194
+ {
195
+ "completion_length": 3993.267822265625,
196
+ "epoch": 0.3916083916083916,
197
+ "grad_norm": 0.32656392455101013,
198
+ "kl": 0.13916015625,
199
+ "learning_rate": 8.354769778736407e-06,
200
+ "loss": 0.0056,
201
+ "reward": 0.6285714283585548,
202
+ "reward_std": 0.29084962233901024,
203
+ "rewards/accuracy_reward": 0.29285714589059353,
204
+ "rewards/format_reward": 0.0,
205
+ "rewards/tag_count_reward": 0.33571428060531616,
206
+ "step": 14
207
+ },
208
+ {
209
+ "completion_length": 4049.6571044921875,
210
+ "epoch": 0.4195804195804196,
211
+ "grad_norm": 0.3071815073490143,
212
+ "kl": 0.1650390625,
213
+ "learning_rate": 8.00006604858821e-06,
214
+ "loss": 0.0066,
215
+ "reward": 0.5178571417927742,
216
+ "reward_std": 0.17362720984965563,
217
+ "rewards/accuracy_reward": 0.20714286155998707,
218
+ "rewards/format_reward": 0.0,
219
+ "rewards/tag_count_reward": 0.3107142820954323,
220
+ "step": 15
221
+ },
222
+ {
223
+ "completion_length": 4045.789306640625,
224
+ "epoch": 0.44755244755244755,
225
+ "grad_norm": 0.40854963660240173,
226
+ "kl": 0.2138671875,
227
+ "learning_rate": 7.621285315716991e-06,
228
+ "loss": 0.0085,
229
+ "reward": 0.5482142865657806,
230
+ "reward_std": 0.2475740760564804,
231
+ "rewards/accuracy_reward": 0.22500000521540642,
232
+ "rewards/format_reward": 0.0,
233
+ "rewards/tag_count_reward": 0.3232142925262451,
234
+ "step": 16
235
+ },
236
+ {
237
+ "completion_length": 3979.9750366210938,
238
+ "epoch": 0.4755244755244755,
239
+ "grad_norm": 0.48029232025146484,
240
+ "kl": 0.250732421875,
241
+ "learning_rate": 7.222075445642904e-06,
242
+ "loss": 0.01,
243
+ "reward": 0.5223214402794838,
244
+ "reward_std": 0.2250591404736042,
245
+ "rewards/accuracy_reward": 0.20357143506407738,
246
+ "rewards/format_reward": 0.0,
247
+ "rewards/tag_count_reward": 0.3187500014901161,
248
+ "step": 17
249
+ },
250
+ {
251
+ "completion_length": 3790.7999877929688,
252
+ "epoch": 0.5034965034965035,
253
+ "grad_norm": 0.59281986951828,
254
+ "kl": 0.26953125,
255
+ "learning_rate": 6.80628104764508e-06,
256
+ "loss": 0.0108,
257
+ "reward": 0.5714285746216774,
258
+ "reward_std": 0.20503208599984646,
259
+ "rewards/accuracy_reward": 0.21428572107106447,
260
+ "rewards/format_reward": 0.0,
261
+ "rewards/tag_count_reward": 0.3571428582072258,
262
+ "step": 18
263
+ },
264
+ {
265
+ "completion_length": 3469.2178344726562,
266
+ "epoch": 0.5314685314685315,
267
+ "grad_norm": 5.099720478057861,
268
+ "kl": 0.376953125,
269
+ "learning_rate": 6.377906449072578e-06,
270
+ "loss": 0.0151,
271
+ "reward": 0.6991071403026581,
272
+ "reward_std": 0.2952312082052231,
273
+ "rewards/accuracy_reward": 0.30000000819563866,
274
+ "rewards/format_reward": 0.0,
275
+ "rewards/tag_count_reward": 0.3991071358323097,
276
+ "step": 19
277
+ },
278
+ {
279
+ "completion_length": 3258.9036254882812,
280
+ "epoch": 0.5594405594405595,
281
+ "grad_norm": 0.476639062166214,
282
+ "kl": 0.423828125,
283
+ "learning_rate": 5.9410771314830255e-06,
284
+ "loss": 0.017,
285
+ "reward": 0.7660714238882065,
286
+ "reward_std": 0.3425598815083504,
287
+ "rewards/accuracy_reward": 0.3107142932713032,
288
+ "rewards/format_reward": 0.0,
289
+ "rewards/tag_count_reward": 0.4553571492433548,
290
+ "step": 20
291
+ },
292
+ {
293
+ "completion_length": 2936.7607421875,
294
+ "epoch": 0.5874125874125874,
295
+ "grad_norm": 0.9111078977584839,
296
+ "kl": 0.4521484375,
297
+ "learning_rate": 5.500000000000001e-06,
298
+ "loss": 0.0181,
299
+ "reward": 0.7901785671710968,
300
+ "reward_std": 0.2908039838075638,
301
+ "rewards/accuracy_reward": 0.30714286491274834,
302
+ "rewards/format_reward": 0.0,
303
+ "rewards/tag_count_reward": 0.48303571343421936,
304
+ "step": 21
305
+ },
306
+ {
307
+ "completion_length": 2737.4607543945312,
308
+ "epoch": 0.6153846153846154,
309
+ "grad_norm": 0.5476343035697937,
310
+ "kl": 0.54345703125,
311
+ "learning_rate": 5.0589228685169776e-06,
312
+ "loss": 0.0217,
313
+ "reward": 0.7124999910593033,
314
+ "reward_std": 0.22632914781570435,
315
+ "rewards/accuracy_reward": 0.22500000917352736,
316
+ "rewards/format_reward": 0.0,
317
+ "rewards/tag_count_reward": 0.48750000447034836,
318
+ "step": 22
319
+ },
320
+ {
321
+ "completion_length": 2191.2464599609375,
322
+ "epoch": 0.6433566433566433,
323
+ "grad_norm": 0.6787258386611938,
324
+ "kl": 0.5205078125,
325
+ "learning_rate": 4.622093550927423e-06,
326
+ "loss": 0.0208,
327
+ "reward": 0.8017857074737549,
328
+ "reward_std": 0.24978942424058914,
329
+ "rewards/accuracy_reward": 0.32500000670552254,
330
+ "rewards/format_reward": 0.0,
331
+ "rewards/tag_count_reward": 0.47678571939468384,
332
+ "step": 23
333
+ },
334
+ {
335
+ "completion_length": 2145.5035705566406,
336
+ "epoch": 0.6713286713286714,
337
+ "grad_norm": 0.9464374780654907,
338
+ "kl": 0.6494140625,
339
+ "learning_rate": 4.193718952354921e-06,
340
+ "loss": 0.026,
341
+ "reward": 0.6982142925262451,
342
+ "reward_std": 0.2445763349533081,
343
+ "rewards/accuracy_reward": 0.19642857555299997,
344
+ "rewards/format_reward": 0.0,
345
+ "rewards/tag_count_reward": 0.5017857104539871,
346
+ "step": 24
347
+ },
348
+ {
349
+ "completion_length": 1999.6785888671875,
350
+ "epoch": 0.6993006993006993,
351
+ "grad_norm": 0.7832499742507935,
352
+ "kl": 0.626953125,
353
+ "learning_rate": 3.777924554357096e-06,
354
+ "loss": 0.025,
355
+ "reward": 0.7687499970197678,
356
+ "reward_std": 0.2781074270606041,
357
+ "rewards/accuracy_reward": 0.2678571492433548,
358
+ "rewards/format_reward": 0.0,
359
+ "rewards/tag_count_reward": 0.500892847776413,
360
+ "step": 25
361
+ },
362
+ {
363
+ "completion_length": 2141.8035888671875,
364
+ "epoch": 0.7272727272727273,
365
+ "grad_norm": 0.8674513697624207,
366
+ "kl": 0.765625,
367
+ "learning_rate": 3.378714684283011e-06,
368
+ "loss": 0.0306,
369
+ "reward": 0.7142857015132904,
370
+ "reward_std": 0.226736880838871,
371
+ "rewards/accuracy_reward": 0.22500000149011612,
372
+ "rewards/format_reward": 0.0,
373
+ "rewards/tag_count_reward": 0.4892857298254967,
374
+ "step": 26
375
+ },
376
+ {
377
+ "completion_length": 1950.5678100585938,
378
+ "epoch": 0.7552447552447552,
379
+ "grad_norm": 0.7500644326210022,
380
+ "kl": 0.65234375,
381
+ "learning_rate": 2.9999339514117913e-06,
382
+ "loss": 0.0261,
383
+ "reward": 0.7374999821186066,
384
+ "reward_std": 0.25540266558527946,
385
+ "rewards/accuracy_reward": 0.19285714533179998,
386
+ "rewards/format_reward": 0.0,
387
+ "rewards/tag_count_reward": 0.5446428656578064,
388
+ "step": 27
389
+ },
390
+ {
391
+ "completion_length": 2502.7821655273438,
392
+ "epoch": 0.7832167832167832,
393
+ "grad_norm": 1.129561185836792,
394
+ "kl": 0.869140625,
395
+ "learning_rate": 2.645230221263596e-06,
396
+ "loss": 0.0348,
397
+ "reward": 0.5919642895460129,
398
+ "reward_std": 0.18119085393846035,
399
+ "rewards/accuracy_reward": 0.10714286030270159,
400
+ "rewards/format_reward": 0.0,
401
+ "rewards/tag_count_reward": 0.4848214387893677,
402
+ "step": 28
403
+ },
404
+ {
405
+ "completion_length": 2735.4535522460938,
406
+ "epoch": 0.8111888111888111,
407
+ "grad_norm": 0.673635721206665,
408
+ "kl": 0.8076171875,
409
+ "learning_rate": 2.3180194846605367e-06,
410
+ "loss": 0.0323,
411
+ "reward": 0.6232142895460129,
412
+ "reward_std": 0.23368354886770248,
413
+ "rewards/accuracy_reward": 0.1428571476135403,
414
+ "rewards/format_reward": 0.0,
415
+ "rewards/tag_count_reward": 0.4803571552038193,
416
+ "step": 29
417
+ },
418
+ {
419
+ "completion_length": 2827.32861328125,
420
+ "epoch": 0.8391608391608392,
421
+ "grad_norm": 1.255906343460083,
422
+ "kl": 0.8408203125,
423
+ "learning_rate": 2.021452959867684e-06,
424
+ "loss": 0.0336,
425
+ "reward": 0.5776785612106323,
426
+ "reward_std": 0.23459363728761673,
427
+ "rewards/accuracy_reward": 0.08928571734577417,
428
+ "rewards/format_reward": 0.0,
429
+ "rewards/tag_count_reward": 0.4883928596973419,
430
+ "step": 30
431
+ },
432
+ {
433
+ "completion_length": 3130.0,
434
+ "epoch": 0.8671328671328671,
435
+ "grad_norm": 0.8778608441352844,
436
+ "kl": 0.8134765625,
437
+ "learning_rate": 1.7583867446385461e-06,
438
+ "loss": 0.0325,
439
+ "reward": 0.5687500089406967,
440
+ "reward_std": 0.21473033353686333,
441
+ "rewards/accuracy_reward": 0.11785714444704354,
442
+ "rewards/format_reward": 0.0,
443
+ "rewards/tag_count_reward": 0.4508928656578064,
444
+ "step": 31
445
+ },
446
+ {
447
+ "completion_length": 3108.8750610351562,
448
+ "epoch": 0.8951048951048951,
449
+ "grad_norm": 0.703495979309082,
450
+ "kl": 0.6640625,
451
+ "learning_rate": 1.531354310432403e-06,
452
+ "loss": 0.0266,
453
+ "reward": 0.6116071492433548,
454
+ "reward_std": 0.24049308896064758,
455
+ "rewards/accuracy_reward": 0.15000000223517418,
456
+ "rewards/format_reward": 0.0,
457
+ "rewards/tag_count_reward": 0.4616071507334709,
458
+ "step": 32
459
+ },
460
+ {
461
+ "completion_length": 3385.7571411132812,
462
+ "epoch": 0.9230769230769231,
463
+ "grad_norm": 0.9375730752944946,
464
+ "kl": 0.7919921875,
465
+ "learning_rate": 1.3425421036992098e-06,
466
+ "loss": 0.0316,
467
+ "reward": 0.4883928596973419,
468
+ "reward_std": 0.172358563169837,
469
+ "rewards/accuracy_reward": 0.06428571604192257,
470
+ "rewards/format_reward": 0.0,
471
+ "rewards/tag_count_reward": 0.4241071343421936,
472
+ "step": 33
473
+ },
474
+ {
475
+ "completion_length": 3170.0820922851562,
476
+ "epoch": 0.951048951048951,
477
+ "grad_norm": 1.1337316036224365,
478
+ "kl": 0.58447265625,
479
+ "learning_rate": 1.1937684892050606e-06,
480
+ "loss": 0.0234,
481
+ "reward": 0.6321428567171097,
482
+ "reward_std": 0.288823913782835,
483
+ "rewards/accuracy_reward": 0.17857143469154835,
484
+ "rewards/format_reward": 0.0,
485
+ "rewards/tag_count_reward": 0.4535714313387871,
486
+ "step": 34
487
+ },
488
+ {
489
+ "completion_length": 3635.4571533203125,
490
+ "epoch": 0.9790209790209791,
491
+ "grad_norm": 0.7671982049942017,
492
+ "kl": 0.62109375,
493
+ "learning_rate": 1.0864662381854632e-06,
494
+ "loss": 0.0248,
495
+ "reward": 0.557142861187458,
496
+ "reward_std": 0.18924793601036072,
497
+ "rewards/accuracy_reward": 0.14642857760190964,
498
+ "rewards/format_reward": 0.0,
499
+ "rewards/tag_count_reward": 0.410714291036129,
500
+ "step": 35
501
+ },
502
+ {
503
+ "completion_length": 3688.6334635416665,
504
+ "epoch": 1.0,
505
+ "grad_norm": 0.7671982049942017,
506
+ "kl": 0.62109375,
507
+ "learning_rate": 1.0216687299751146e-06,
508
+ "loss": 0.0186,
509
+ "reward": 0.5226190686225891,
510
+ "reward_std": 0.2011700620253881,
511
+ "rewards/accuracy_reward": 0.10952381292978923,
512
+ "rewards/format_reward": 0.0,
513
+ "rewards/tag_count_reward": 0.41309523582458496,
514
+ "step": 36
515
+ },
516
+ {
517
+ "epoch": 1.0,
518
+ "step": 36,
519
+ "total_flos": 0.0,
520
+ "train_loss": 0.014487237919349916,
521
+ "train_runtime": 13938.4252,
522
+ "train_samples_per_second": 0.072,
523
+ "train_steps_per_second": 0.003
524
+ }
525
+ ],
526
+ "logging_steps": 1,
527
+ "max_steps": 36,
528
+ "num_input_tokens_seen": 0,
529
+ "num_train_epochs": 1,
530
+ "save_steps": 5,
531
+ "stateful_callbacks": {
532
+ "TrainerControl": {
533
+ "args": {
534
+ "should_epoch_stop": false,
535
+ "should_evaluate": false,
536
+ "should_log": false,
537
+ "should_save": true,
538
+ "should_training_stop": true
539
+ },
540
+ "attributes": {}
541
+ }
542
+ },
543
+ "total_flos": 0.0,
544
+ "train_batch_size": 10,
545
+ "trial_name": null,
546
+ "trial_params": null
547
+ }