Rakancorle1 committed on
Commit
b825cc4
·
verified ·
1 Parent(s): 9333060

End of training

Browse files
Files changed (5) hide show
  1. README.md +2 -1
  2. all_results.json +8 -0
  3. train_results.json +8 -0
  4. trainer_state.json +603 -0
  5. training_loss.png +0 -0
README.md CHANGED
@@ -4,6 +4,7 @@ license: apache-2.0
4
  base_model: Qwen/Qwen3-4B-Instruct-2507
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: Qwen3-4B-Instruct_0910_LODO_gitlab_full
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # Qwen3-4B-Instruct_0910_LODO_gitlab_full
17
 
18
- This model is a fine-tuned version of [Qwen/Qwen3-4B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507) on an unknown dataset.
19
 
20
  ## Model description
21
 
 
4
  base_model: Qwen/Qwen3-4B-Instruct-2507
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: Qwen3-4B-Instruct_0910_LODO_gitlab_full
 
16
 
17
  # Qwen3-4B-Instruct_0910_LODO_gitlab_full
18
 
19
+ This model is a fine-tuned version of [Qwen/Qwen3-4B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507) on the Policy_Traj_LODO_gitlab dataset.
20
 
21
  ## Model description
22
 
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 33113178439680.0,
4
+ "train_loss": 0.042386234274015444,
5
+ "train_runtime": 4174.8779,
6
+ "train_samples_per_second": 24.521,
7
+ "train_steps_per_second": 0.192
8
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 33113178439680.0,
4
+ "train_loss": 0.042386234274015444,
5
+ "train_runtime": 4174.8779,
6
+ "train_samples_per_second": 24.521,
7
+ "train_steps_per_second": 0.192
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 801,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.03750586029067042,
14
+ "grad_norm": 34.85824610814536,
15
+ "learning_rate": 1.8e-05,
16
+ "loss": 0.3557,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.07501172058134084,
21
+ "grad_norm": 7.1481602122450285,
22
+ "learning_rate": 3.8e-05,
23
+ "loss": 0.1487,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.11251758087201125,
28
+ "grad_norm": 8.614036123620082,
29
+ "learning_rate": 4.999672209164081e-05,
30
+ "loss": 0.1396,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.15002344116268168,
35
+ "grad_norm": 10.10705100519714,
36
+ "learning_rate": 4.995985549356568e-05,
37
+ "loss": 0.0895,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.1875293014533521,
42
+ "grad_norm": 5.6723632225651555,
43
+ "learning_rate": 4.988208552916535e-05,
44
+ "loss": 0.0733,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.2250351617440225,
49
+ "grad_norm": 3.5917593429678067,
50
+ "learning_rate": 4.976353964522509e-05,
51
+ "loss": 0.2835,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.26254102203469293,
56
+ "grad_norm": 2.7853884780818023,
57
+ "learning_rate": 4.960441211072686e-05,
58
+ "loss": 0.0559,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.30004688232536336,
63
+ "grad_norm": 1.0618750438052633,
64
+ "learning_rate": 4.940496369848795e-05,
65
+ "loss": 0.0562,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.33755274261603374,
70
+ "grad_norm": 0.21567312878465145,
71
+ "learning_rate": 4.916552125781528e-05,
72
+ "loss": 0.0545,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.3750586029067042,
77
+ "grad_norm": 6.591249771359263,
78
+ "learning_rate": 4.8886477178875826e-05,
79
+ "loss": 0.0665,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.4125644631973746,
84
+ "grad_norm": 0.2514334496388468,
85
+ "learning_rate": 4.856828874966086e-05,
86
+ "loss": 0.0577,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.450070323488045,
91
+ "grad_norm": 2.19013548059961,
92
+ "learning_rate": 4.821147740659794e-05,
93
+ "loss": 0.0424,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.4875761837787154,
98
+ "grad_norm": 1.8200457984558476,
99
+ "learning_rate": 4.781662788003851e-05,
100
+ "loss": 0.0435,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.5250820440693859,
105
+ "grad_norm": 1.788433839164408,
106
+ "learning_rate": 4.738438723602154e-05,
107
+ "loss": 0.0459,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.5625879043600562,
112
+ "grad_norm": 3.4278131423512943,
113
+ "learning_rate": 4.69154638158837e-05,
114
+ "loss": 0.0416,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.6000937646507267,
119
+ "grad_norm": 0.26651980565241046,
120
+ "learning_rate": 4.641062607545347e-05,
121
+ "loss": 0.0422,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.6375996249413971,
126
+ "grad_norm": 1.6317479332317566,
127
+ "learning_rate": 4.587070132573178e-05,
128
+ "loss": 0.0388,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.6751054852320675,
133
+ "grad_norm": 1.6520649014609303,
134
+ "learning_rate": 4.529657437712276e-05,
135
+ "loss": 0.0334,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.712611345522738,
140
+ "grad_norm": 0.15277860741515586,
141
+ "learning_rate": 4.4689186089436366e-05,
142
+ "loss": 0.0412,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.7501172058134083,
147
+ "grad_norm": 0.9030791837609309,
148
+ "learning_rate": 4.404953183003916e-05,
149
+ "loss": 0.036,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.7876230661040787,
154
+ "grad_norm": 1.9866732218980805,
155
+ "learning_rate": 4.337865984268001e-05,
156
+ "loss": 0.0393,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.8251289263947492,
161
+ "grad_norm": 0.23596674543489504,
162
+ "learning_rate": 4.267766952966369e-05,
163
+ "loss": 0.0385,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.8626347866854196,
168
+ "grad_norm": 2.7613828112655905,
169
+ "learning_rate": 4.194770965018758e-05,
170
+ "loss": 0.0411,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.90014064697609,
175
+ "grad_norm": 0.9440762280964653,
176
+ "learning_rate": 4.118997643779401e-05,
177
+ "loss": 0.0346,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.9376465072667605,
182
+ "grad_norm": 0.41009567832530625,
183
+ "learning_rate": 4.0405711640023186e-05,
184
+ "loss": 0.0344,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.9751523675574308,
189
+ "grad_norm": 0.32482405462507263,
190
+ "learning_rate": 3.9596200483479385e-05,
191
+ "loss": 0.0324,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 1.0112517580872011,
196
+ "grad_norm": 0.806991725526476,
197
+ "learning_rate": 3.876276956764509e-05,
198
+ "loss": 0.0346,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 1.0487576183778715,
203
+ "grad_norm": 1.1338976875621802,
204
+ "learning_rate": 3.7906784690894645e-05,
205
+ "loss": 0.0424,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 1.086263478668542,
210
+ "grad_norm": 1.500478633231241,
211
+ "learning_rate": 3.702964861227013e-05,
212
+ "loss": 0.0346,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 1.1237693389592125,
217
+ "grad_norm": 1.4409676013917596,
218
+ "learning_rate": 3.613279875268731e-05,
219
+ "loss": 0.0354,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 1.1612751992498829,
224
+ "grad_norm": 0.2422476126318276,
225
+ "learning_rate": 3.521770483933891e-05,
226
+ "loss": 0.034,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 1.1987810595405533,
231
+ "grad_norm": 1.5840918548478307,
232
+ "learning_rate": 3.4285866497155414e-05,
233
+ "loss": 0.0459,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 1.2362869198312236,
238
+ "grad_norm": 0.37403459055017013,
239
+ "learning_rate": 3.333881079127052e-05,
240
+ "loss": 0.0292,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 1.273792780121894,
245
+ "grad_norm": 0.7618214390592176,
246
+ "learning_rate": 3.2378089724518465e-05,
247
+ "loss": 0.0279,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 1.3112986404125644,
252
+ "grad_norm": 0.6779062975577348,
253
+ "learning_rate": 3.1405277694064305e-05,
254
+ "loss": 0.0317,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 1.3488045007032348,
259
+ "grad_norm": 0.6053365729294653,
260
+ "learning_rate": 3.0421968911335196e-05,
261
+ "loss": 0.0332,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 1.3863103609939054,
266
+ "grad_norm": 0.5023346538140172,
267
+ "learning_rate": 2.9429774789480575e-05,
268
+ "loss": 0.0258,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 1.4238162212845757,
273
+ "grad_norm": 0.1512518290372575,
274
+ "learning_rate": 2.843032130264289e-05,
275
+ "loss": 0.0254,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 1.4613220815752461,
280
+ "grad_norm": 0.4256415803778755,
281
+ "learning_rate": 2.7425246321366203e-05,
282
+ "loss": 0.0289,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 1.4988279418659165,
287
+ "grad_norm": 2.0526292381935587,
288
+ "learning_rate": 2.6416196928509408e-05,
289
+ "loss": 0.0293,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 1.5363338021565869,
294
+ "grad_norm": 2.199901935419105,
295
+ "learning_rate": 2.540482672006254e-05,
296
+ "loss": 0.0324,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 1.5738396624472575,
301
+ "grad_norm": 0.6587554924832156,
302
+ "learning_rate": 2.4392793095289677e-05,
303
+ "loss": 0.0318,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 1.6113455227379276,
308
+ "grad_norm": 0.11953774619854174,
309
+ "learning_rate": 2.338175454063911e-05,
310
+ "loss": 0.027,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 1.6488513830285982,
315
+ "grad_norm": 1.8366328720607281,
316
+ "learning_rate": 2.2373367911871904e-05,
317
+ "loss": 0.0271,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 1.6863572433192686,
322
+ "grad_norm": 0.16134023570988165,
323
+ "learning_rate": 2.136928571886275e-05,
324
+ "loss": 0.0309,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 1.723863103609939,
329
+ "grad_norm": 0.5782347760396018,
330
+ "learning_rate": 2.03711534175227e-05,
331
+ "loss": 0.0276,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 1.7613689639006096,
336
+ "grad_norm": 0.7600638604790938,
337
+ "learning_rate": 1.9380606713281775e-05,
338
+ "loss": 0.0284,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 1.7988748241912798,
343
+ "grad_norm": 0.28905055512217726,
344
+ "learning_rate": 1.8399268880550174e-05,
345
+ "loss": 0.0289,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 1.8363806844819504,
350
+ "grad_norm": 0.47269674339106393,
351
+ "learning_rate": 1.7428748102551237e-05,
352
+ "loss": 0.0254,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 1.8738865447726207,
357
+ "grad_norm": 0.1126048083243114,
358
+ "learning_rate": 1.6470634835885097e-05,
359
+ "loss": 0.0329,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 1.9113924050632911,
364
+ "grad_norm": 0.4869034202302143,
365
+ "learning_rate": 1.552649920414233e-05,
366
+ "loss": 0.0294,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 1.9488982653539617,
371
+ "grad_norm": 0.1048044631528693,
372
+ "learning_rate": 1.4597888424838518e-05,
373
+ "loss": 0.03,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 1.9864041256446319,
378
+ "grad_norm": 0.257084202132171,
379
+ "learning_rate": 1.368632427388653e-05,
380
+ "loss": 0.0263,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 2.0225035161744023,
385
+ "grad_norm": 0.41612935348774654,
386
+ "learning_rate": 1.2793300591761742e-05,
387
+ "loss": 0.0255,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 2.060009376465073,
392
+ "grad_norm": 0.1472784892712205,
393
+ "learning_rate": 1.1920280835446748e-05,
394
+ "loss": 0.019,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 2.097515236755743,
399
+ "grad_norm": 0.7021607086288816,
400
+ "learning_rate": 1.1068695680167664e-05,
401
+ "loss": 0.0259,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 2.1350210970464136,
406
+ "grad_norm": 1.0121270104703033,
407
+ "learning_rate": 1.0239940674851941e-05,
408
+ "loss": 0.0228,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 2.172526957337084,
413
+ "grad_norm": 0.8364824257153406,
414
+ "learning_rate": 9.43537395515003e-06,
415
+ "loss": 0.0233,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 2.2100328176277544,
420
+ "grad_norm": 0.29831710959245356,
421
+ "learning_rate": 8.656314017768693e-06,
422
+ "loss": 0.0213,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 2.247538677918425,
427
+ "grad_norm": 0.21598369420228658,
428
+ "learning_rate": 7.904037559763162e-06,
429
+ "loss": 0.0228,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 2.285044538209095,
434
+ "grad_norm": 0.5662760774283471,
435
+ "learning_rate": 7.179777386329276e-06,
436
+ "loss": 0.0184,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 2.3225503984997657,
441
+ "grad_norm": 0.10841392927498035,
442
+ "learning_rate": 6.484720390524007e-06,
443
+ "loss": 0.0223,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 2.360056258790436,
448
+ "grad_norm": 0.2351999089031456,
449
+ "learning_rate": 5.820005608225346e-06,
450
+ "loss": 0.0221,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 2.3975621190811065,
455
+ "grad_norm": 0.27088765350399413,
456
+ "learning_rate": 5.186722351518822e-06,
457
+ "loss": 0.0276,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 2.4350679793717767,
462
+ "grad_norm": 0.08323548651396794,
463
+ "learning_rate": 4.585908423569724e-06,
464
+ "loss": 0.0209,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 2.4725738396624473,
469
+ "grad_norm": 0.21245905268041415,
470
+ "learning_rate": 4.0185484179064425e-06,
471
+ "loss": 0.0233,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 2.510079699953118,
476
+ "grad_norm": 0.5537577775149638,
477
+ "learning_rate": 3.4855721049018688e-06,
478
+ "loss": 0.0229,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 2.547585560243788,
483
+ "grad_norm": 0.7847967042002256,
484
+ "learning_rate": 2.98785290809723e-06,
485
+ "loss": 0.0253,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 2.5850914205344586,
490
+ "grad_norm": 0.507105447694394,
491
+ "learning_rate": 2.52620647286512e-06,
492
+ "loss": 0.0206,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 2.6225972808251288,
497
+ "grad_norm": 0.24719979538798334,
498
+ "learning_rate": 2.101389329757478e-06,
499
+ "loss": 0.0172,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 2.6601031411157994,
504
+ "grad_norm": 0.481232043094042,
505
+ "learning_rate": 1.7140976547289438e-06,
506
+ "loss": 0.0197,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 2.6976090014064695,
511
+ "grad_norm": 0.525987532669796,
512
+ "learning_rate": 1.3649661282672476e-06,
513
+ "loss": 0.0202,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 2.73511486169714,
518
+ "grad_norm": 0.12411824566724937,
519
+ "learning_rate": 1.0545668953003241e-06,
520
+ "loss": 0.0215,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 2.7726207219878107,
525
+ "grad_norm": 0.2216514712018457,
526
+ "learning_rate": 7.834086275845587e-07,
527
+ "loss": 0.0182,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 2.810126582278481,
532
+ "grad_norm": 0.220985617144204,
533
+ "learning_rate": 5.519356901107358e-07,
534
+ "loss": 0.0244,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 2.8476324425691515,
539
+ "grad_norm": 0.34207273950972444,
540
+ "learning_rate": 3.605274128937464e-07,
541
+ "loss": 0.0233,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 2.885138302859822,
546
+ "grad_norm": 0.11697799335786845,
547
+ "learning_rate": 2.094974693393731e-07,
548
+ "loss": 0.019,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 2.9226441631504922,
553
+ "grad_norm": 0.3157831029546487,
554
+ "learning_rate": 9.90933622069562e-08,
555
+ "loss": 0.0177,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 2.960150023441163,
560
+ "grad_norm": 0.16740144227166562,
561
+ "learning_rate": 2.9496018010233274e-08,
562
+ "loss": 0.022,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 2.997655883731833,
567
+ "grad_norm": 0.33884453903018824,
568
+ "learning_rate": 8.194905210923143e-10,
569
+ "loss": 0.0232,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 3.0,
574
+ "step": 801,
575
+ "total_flos": 33113178439680.0,
576
+ "train_loss": 0.042386234274015444,
577
+ "train_runtime": 4174.8779,
578
+ "train_samples_per_second": 24.521,
579
+ "train_steps_per_second": 0.192
580
+ }
581
+ ],
582
+ "logging_steps": 10,
583
+ "max_steps": 801,
584
+ "num_input_tokens_seen": 0,
585
+ "num_train_epochs": 3,
586
+ "save_steps": 100,
587
+ "stateful_callbacks": {
588
+ "TrainerControl": {
589
+ "args": {
590
+ "should_epoch_stop": false,
591
+ "should_evaluate": false,
592
+ "should_log": false,
593
+ "should_save": true,
594
+ "should_training_stop": true
595
+ },
596
+ "attributes": {}
597
+ }
598
+ },
599
+ "total_flos": 33113178439680.0,
600
+ "train_batch_size": 4,
601
+ "trial_name": null,
602
+ "trial_params": null
603
+ }
training_loss.png ADDED