maanasharma5 commited on
Commit
a4eb780
·
verified ·
1 Parent(s): 45cb0e5

Upload folder using huggingface_hub

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39118f7368d7b390b9e48c4e9a09b46e6fcdee5e50be90f61bfd90db2c637fa8
3
  size 25172088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:793c2c9d0135d56ef7de4863551a40cc9d5c7b029605658aa0537f5f355f5cc2
3
  size 25172088
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b60f982069b7054a7c3e41818e87b3ee2a8d276881f8d520b1f0c0d70d36309
3
  size 50372538
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2802faa1f7ee79dc302ab9df00e418e656ecd332ea2c97989901116bd2f2085
3
  size 50372538
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5994634e90ce1b45d012a45568063be05fea876e791cd66b48a4efc924164b2
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:feded292b067a53a2aeb0e2a23dd8fd5fd080c27efc2767fe4b430da7e5f7d6f
3
  size 14244
trainer_state.json CHANGED
@@ -12,2184 +12,2184 @@
12
  "epoch": 0.0032,
13
  "grad_norm": 10.0,
14
  "learning_rate": 2.132196162046908e-06,
15
- "loss": 22.4733,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.0064,
20
- "grad_norm": 9.999999046325684,
21
  "learning_rate": 4.264392324093816e-06,
22
- "loss": 18.8042,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.0096,
27
- "grad_norm": 10.000001907348633,
28
  "learning_rate": 6.396588486140726e-06,
29
- "loss": 19.6935,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.0128,
34
- "grad_norm": 9.999999046325684,
35
  "learning_rate": 8.528784648187633e-06,
36
- "loss": 19.4804,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.016,
41
- "grad_norm": 10.000000953674316,
42
  "learning_rate": 1.0660980810234541e-05,
43
- "loss": 17.4569,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.0192,
48
  "grad_norm": 10.0,
49
  "learning_rate": 1.2793176972281452e-05,
50
- "loss": 18.3143,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.0224,
55
  "grad_norm": 10.0,
56
  "learning_rate": 1.4925373134328357e-05,
57
- "loss": 16.6743,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.0256,
62
- "grad_norm": 9.999999046325684,
63
  "learning_rate": 1.7057569296375266e-05,
64
- "loss": 14.7713,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.0288,
69
- "grad_norm": 9.999999046325684,
70
  "learning_rate": 1.9189765458422178e-05,
71
- "loss": 14.883,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.032,
76
  "grad_norm": 9.999999046325684,
77
  "learning_rate": 2.1321961620469083e-05,
78
- "loss": 14.1393,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.0352,
83
- "grad_norm": 9.999999046325684,
84
  "learning_rate": 2.345415778251599e-05,
85
- "loss": 11.5208,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.0384,
90
- "grad_norm": 10.0,
91
  "learning_rate": 2.5586353944562904e-05,
92
- "loss": 12.865,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.0416,
97
  "grad_norm": 10.0,
98
  "learning_rate": 2.771855010660981e-05,
99
- "loss": 10.3148,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.0448,
104
- "grad_norm": 9.999998092651367,
105
  "learning_rate": 2.9850746268656714e-05,
106
- "loss": 9.7127,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.048,
111
- "grad_norm": 9.999998092651367,
112
  "learning_rate": 3.1982942430703626e-05,
113
- "loss": 8.3006,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.0512,
118
  "grad_norm": 9.999999046325684,
119
  "learning_rate": 3.411513859275053e-05,
120
- "loss": 7.292,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.0544,
125
- "grad_norm": 9.999999046325684,
126
  "learning_rate": 3.624733475479744e-05,
127
- "loss": 6.0268,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.0576,
132
- "grad_norm": 9.999998092651367,
133
  "learning_rate": 3.8379530916844355e-05,
134
- "loss": 5.7297,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.0608,
139
- "grad_norm": 10.0,
140
  "learning_rate": 4.051172707889126e-05,
141
- "loss": 4.6862,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.064,
146
- "grad_norm": 9.999999046325684,
147
  "learning_rate": 4.2643923240938166e-05,
148
- "loss": 4.0643,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.0672,
153
- "grad_norm": 9.999998092651367,
154
  "learning_rate": 4.477611940298508e-05,
155
- "loss": 3.3502,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.0704,
160
- "grad_norm": 9.999998092651367,
161
  "learning_rate": 4.690831556503198e-05,
162
- "loss": 2.6549,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.0736,
167
- "grad_norm": 8.796483039855957,
168
  "learning_rate": 4.904051172707889e-05,
169
- "loss": 2.883,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.0768,
174
- "grad_norm": 8.624096870422363,
175
  "learning_rate": 5.117270788912581e-05,
176
- "loss": 2.3625,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.08,
181
- "grad_norm": 4.12373161315918,
182
  "learning_rate": 5.330490405117271e-05,
183
- "loss": 2.0516,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.0832,
188
- "grad_norm": 6.344674110412598,
189
  "learning_rate": 5.543710021321962e-05,
190
- "loss": 1.8814,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.0864,
195
- "grad_norm": 5.940646171569824,
196
  "learning_rate": 5.756929637526652e-05,
197
- "loss": 1.8997,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.0896,
202
- "grad_norm": 4.136787414550781,
203
  "learning_rate": 5.970149253731343e-05,
204
- "loss": 1.6641,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.0928,
209
- "grad_norm": 3.336697578430176,
210
  "learning_rate": 6.183368869936035e-05,
211
- "loss": 1.5741,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.096,
216
- "grad_norm": 4.003772735595703,
217
  "learning_rate": 6.396588486140725e-05,
218
- "loss": 1.4759,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.0992,
223
- "grad_norm": 5.183838367462158,
224
  "learning_rate": 6.609808102345416e-05,
225
- "loss": 1.5434,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.1024,
230
- "grad_norm": 4.6075849533081055,
231
  "learning_rate": 6.823027718550106e-05,
232
- "loss": 1.5185,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.1056,
237
- "grad_norm": 4.707767486572266,
238
  "learning_rate": 7.036247334754798e-05,
239
- "loss": 1.3706,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.1088,
244
- "grad_norm": 3.7312469482421875,
245
  "learning_rate": 7.249466950959489e-05,
246
- "loss": 1.3455,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.112,
251
- "grad_norm": 3.605818033218384,
252
  "learning_rate": 7.46268656716418e-05,
253
- "loss": 1.2145,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.1152,
258
- "grad_norm": 4.285920143127441,
259
  "learning_rate": 7.675906183368871e-05,
260
- "loss": 1.303,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.1184,
265
- "grad_norm": 3.215698480606079,
266
  "learning_rate": 7.889125799573562e-05,
267
- "loss": 1.168,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.1216,
272
- "grad_norm": 4.043213844299316,
273
  "learning_rate": 8.102345415778252e-05,
274
- "loss": 1.0962,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.1248,
279
- "grad_norm": 4.1487555503845215,
280
  "learning_rate": 8.315565031982943e-05,
281
- "loss": 1.1853,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.128,
286
- "grad_norm": 3.942502498626709,
287
  "learning_rate": 8.528784648187633e-05,
288
- "loss": 1.0418,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.1312,
293
- "grad_norm": 4.188830852508545,
294
  "learning_rate": 8.742004264392325e-05,
295
- "loss": 1.0992,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.1344,
300
- "grad_norm": 4.130038738250732,
301
  "learning_rate": 8.955223880597016e-05,
302
- "loss": 0.976,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.1376,
307
- "grad_norm": 3.643944501876831,
308
  "learning_rate": 9.168443496801706e-05,
309
- "loss": 1.0592,
310
  "step": 430
311
  },
312
  {
313
  "epoch": 0.1408,
314
- "grad_norm": 3.760075092315674,
315
  "learning_rate": 9.381663113006397e-05,
316
- "loss": 1.0115,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 0.144,
321
- "grad_norm": 2.4692914485931396,
322
  "learning_rate": 9.594882729211087e-05,
323
- "loss": 1.0096,
324
  "step": 450
325
  },
326
  {
327
  "epoch": 0.1472,
328
- "grad_norm": 3.1593716144561768,
329
  "learning_rate": 9.808102345415778e-05,
330
- "loss": 0.9181,
331
  "step": 460
332
  },
333
  {
334
  "epoch": 0.1504,
335
- "grad_norm": 3.6118807792663574,
336
  "learning_rate": 9.998877161464182e-05,
337
- "loss": 0.921,
338
  "step": 470
339
  },
340
  {
341
  "epoch": 0.1536,
342
- "grad_norm": 3.3329732418060303,
343
  "learning_rate": 9.987648776105997e-05,
344
- "loss": 0.8714,
345
  "step": 480
346
  },
347
  {
348
  "epoch": 0.1568,
349
- "grad_norm": 2.9491333961486816,
350
  "learning_rate": 9.97642039074781e-05,
351
- "loss": 0.8553,
352
  "step": 490
353
  },
354
  {
355
  "epoch": 0.16,
356
- "grad_norm": 3.0345425605773926,
357
  "learning_rate": 9.965192005389625e-05,
358
- "loss": 0.9232,
359
  "step": 500
360
  },
361
  {
362
  "epoch": 0.1632,
363
- "grad_norm": 3.3396542072296143,
364
  "learning_rate": 9.95396362003144e-05,
365
- "loss": 0.8112,
366
  "step": 510
367
  },
368
  {
369
  "epoch": 0.1664,
370
- "grad_norm": 3.1262471675872803,
371
  "learning_rate": 9.942735234673256e-05,
372
- "loss": 0.8252,
373
  "step": 520
374
  },
375
  {
376
  "epoch": 0.1696,
377
- "grad_norm": 2.431586742401123,
378
  "learning_rate": 9.931506849315069e-05,
379
- "loss": 0.7863,
380
  "step": 530
381
  },
382
  {
383
  "epoch": 0.1728,
384
- "grad_norm": 3.4569954872131348,
385
  "learning_rate": 9.920278463956883e-05,
386
- "loss": 0.7916,
387
  "step": 540
388
  },
389
  {
390
  "epoch": 0.176,
391
- "grad_norm": 3.2024927139282227,
392
  "learning_rate": 9.909050078598698e-05,
393
- "loss": 0.8355,
394
  "step": 550
395
  },
396
  {
397
  "epoch": 0.1792,
398
- "grad_norm": 3.4040327072143555,
399
  "learning_rate": 9.897821693240512e-05,
400
- "loss": 0.8548,
401
  "step": 560
402
  },
403
  {
404
  "epoch": 0.1824,
405
- "grad_norm": 2.5302040576934814,
406
  "learning_rate": 9.886593307882327e-05,
407
- "loss": 0.7237,
408
  "step": 570
409
  },
410
  {
411
  "epoch": 0.1856,
412
- "grad_norm": 3.6245014667510986,
413
  "learning_rate": 9.875364922524142e-05,
414
- "loss": 0.7395,
415
  "step": 580
416
  },
417
  {
418
  "epoch": 0.1888,
419
- "grad_norm": 2.6966214179992676,
420
  "learning_rate": 9.864136537165956e-05,
421
- "loss": 0.7004,
422
  "step": 590
423
  },
424
  {
425
  "epoch": 0.192,
426
- "grad_norm": 3.207789421081543,
427
  "learning_rate": 9.852908151807771e-05,
428
- "loss": 0.7419,
429
  "step": 600
430
  },
431
  {
432
  "epoch": 0.1952,
433
- "grad_norm": 3.4613256454467773,
434
  "learning_rate": 9.841679766449586e-05,
435
- "loss": 0.7264,
436
  "step": 610
437
  },
438
  {
439
  "epoch": 0.1984,
440
- "grad_norm": 3.311279058456421,
441
  "learning_rate": 9.8304513810914e-05,
442
- "loss": 0.6852,
443
  "step": 620
444
  },
445
  {
446
  "epoch": 0.2016,
447
- "grad_norm": 2.7546231746673584,
448
  "learning_rate": 9.819222995733213e-05,
449
- "loss": 0.6922,
450
  "step": 630
451
  },
452
  {
453
  "epoch": 0.2048,
454
- "grad_norm": 3.5927953720092773,
455
  "learning_rate": 9.807994610375028e-05,
456
- "loss": 0.7391,
457
  "step": 640
458
  },
459
  {
460
  "epoch": 0.208,
461
- "grad_norm": 2.975539207458496,
462
  "learning_rate": 9.796766225016843e-05,
463
- "loss": 0.699,
464
  "step": 650
465
  },
466
  {
467
  "epoch": 0.2112,
468
- "grad_norm": 3.653235673904419,
469
  "learning_rate": 9.785537839658657e-05,
470
- "loss": 0.6669,
471
  "step": 660
472
  },
473
  {
474
  "epoch": 0.2144,
475
- "grad_norm": 3.6133604049682617,
476
  "learning_rate": 9.774309454300472e-05,
477
- "loss": 0.7107,
478
  "step": 670
479
  },
480
  {
481
  "epoch": 0.2176,
482
- "grad_norm": 2.855388641357422,
483
  "learning_rate": 9.763081068942287e-05,
484
- "loss": 0.6938,
485
  "step": 680
486
  },
487
  {
488
  "epoch": 0.2208,
489
- "grad_norm": 2.790356159210205,
490
  "learning_rate": 9.751852683584101e-05,
491
- "loss": 0.6208,
492
  "step": 690
493
  },
494
  {
495
  "epoch": 0.224,
496
- "grad_norm": 3.2559432983398438,
497
  "learning_rate": 9.740624298225916e-05,
498
- "loss": 0.6327,
499
  "step": 700
500
  },
501
  {
502
  "epoch": 0.2272,
503
- "grad_norm": 2.8997764587402344,
504
  "learning_rate": 9.729395912867731e-05,
505
- "loss": 0.6176,
506
  "step": 710
507
  },
508
  {
509
  "epoch": 0.2304,
510
- "grad_norm": 3.3574609756469727,
511
  "learning_rate": 9.718167527509545e-05,
512
- "loss": 0.601,
513
  "step": 720
514
  },
515
  {
516
  "epoch": 0.2336,
517
- "grad_norm": 2.94797945022583,
518
  "learning_rate": 9.706939142151358e-05,
519
- "loss": 0.5904,
520
  "step": 730
521
  },
522
  {
523
  "epoch": 0.2368,
524
- "grad_norm": 2.8964426517486572,
525
  "learning_rate": 9.695710756793174e-05,
526
- "loss": 0.5889,
527
  "step": 740
528
  },
529
  {
530
  "epoch": 0.24,
531
- "grad_norm": 2.505166530609131,
532
  "learning_rate": 9.684482371434989e-05,
533
- "loss": 0.5491,
534
  "step": 750
535
  },
536
  {
537
  "epoch": 0.2432,
538
- "grad_norm": 2.8895745277404785,
539
  "learning_rate": 9.673253986076802e-05,
540
- "loss": 0.5751,
541
  "step": 760
542
  },
543
  {
544
  "epoch": 0.2464,
545
- "grad_norm": 2.348055601119995,
546
  "learning_rate": 9.662025600718617e-05,
547
- "loss": 0.5702,
548
  "step": 770
549
  },
550
  {
551
  "epoch": 0.2496,
552
- "grad_norm": 3.0331923961639404,
553
  "learning_rate": 9.650797215360432e-05,
554
- "loss": 0.5299,
555
  "step": 780
556
  },
557
  {
558
  "epoch": 0.2528,
559
- "grad_norm": 2.881728172302246,
560
  "learning_rate": 9.639568830002246e-05,
561
- "loss": 0.4772,
562
  "step": 790
563
  },
564
  {
565
  "epoch": 0.256,
566
- "grad_norm": 2.9465715885162354,
567
  "learning_rate": 9.628340444644061e-05,
568
- "loss": 0.5137,
569
  "step": 800
570
  },
571
  {
572
  "epoch": 0.2592,
573
- "grad_norm": 2.664348602294922,
574
  "learning_rate": 9.617112059285875e-05,
575
- "loss": 0.4398,
576
  "step": 810
577
  },
578
  {
579
  "epoch": 0.2624,
580
- "grad_norm": 2.931985378265381,
581
  "learning_rate": 9.605883673927689e-05,
582
- "loss": 0.4969,
583
  "step": 820
584
  },
585
  {
586
  "epoch": 0.2656,
587
- "grad_norm": 2.953338861465454,
588
  "learning_rate": 9.594655288569504e-05,
589
- "loss": 0.4829,
590
  "step": 830
591
  },
592
  {
593
  "epoch": 0.2688,
594
- "grad_norm": 3.0215139389038086,
595
  "learning_rate": 9.583426903211319e-05,
596
- "loss": 0.463,
597
  "step": 840
598
  },
599
  {
600
  "epoch": 0.272,
601
- "grad_norm": 3.6885106563568115,
602
  "learning_rate": 9.572198517853134e-05,
603
- "loss": 0.4601,
604
  "step": 850
605
  },
606
  {
607
  "epoch": 0.2752,
608
- "grad_norm": 3.8273849487304688,
609
  "learning_rate": 9.560970132494948e-05,
610
- "loss": 0.5616,
611
  "step": 860
612
  },
613
  {
614
  "epoch": 0.2784,
615
- "grad_norm": 3.266014337539673,
616
  "learning_rate": 9.549741747136763e-05,
617
- "loss": 0.4581,
618
  "step": 870
619
  },
620
  {
621
  "epoch": 0.2816,
622
- "grad_norm": 2.304288387298584,
623
  "learning_rate": 9.538513361778578e-05,
624
- "loss": 0.4192,
625
  "step": 880
626
  },
627
  {
628
  "epoch": 0.2848,
629
- "grad_norm": 2.7526509761810303,
630
  "learning_rate": 9.527284976420391e-05,
631
- "loss": 0.3809,
632
  "step": 890
633
  },
634
  {
635
  "epoch": 0.288,
636
- "grad_norm": 3.033849000930786,
637
  "learning_rate": 9.516056591062205e-05,
638
- "loss": 0.3865,
639
  "step": 900
640
  },
641
  {
642
  "epoch": 0.2912,
643
- "grad_norm": 2.9246816635131836,
644
  "learning_rate": 9.50482820570402e-05,
645
- "loss": 0.3565,
646
  "step": 910
647
  },
648
  {
649
  "epoch": 0.2944,
650
- "grad_norm": 2.111208438873291,
651
  "learning_rate": 9.493599820345834e-05,
652
- "loss": 0.3507,
653
  "step": 920
654
  },
655
  {
656
  "epoch": 0.2976,
657
- "grad_norm": 3.193631172180176,
658
  "learning_rate": 9.482371434987649e-05,
659
- "loss": 0.3136,
660
  "step": 930
661
  },
662
  {
663
  "epoch": 0.3008,
664
- "grad_norm": 2.647897481918335,
665
  "learning_rate": 9.471143049629464e-05,
666
- "loss": 0.3034,
667
  "step": 940
668
  },
669
  {
670
  "epoch": 0.304,
671
- "grad_norm": 4.482762813568115,
672
  "learning_rate": 9.459914664271278e-05,
673
- "loss": 0.3168,
674
  "step": 950
675
  },
676
  {
677
  "epoch": 0.3072,
678
- "grad_norm": 2.408997058868408,
679
  "learning_rate": 9.448686278913093e-05,
680
- "loss": 0.2838,
681
  "step": 960
682
  },
683
  {
684
  "epoch": 0.3104,
685
- "grad_norm": 2.701946496963501,
686
  "learning_rate": 9.437457893554908e-05,
687
- "loss": 0.2945,
688
  "step": 970
689
  },
690
  {
691
  "epoch": 0.3136,
692
- "grad_norm": 2.5696310997009277,
693
  "learning_rate": 9.426229508196722e-05,
694
- "loss": 0.2483,
695
  "step": 980
696
  },
697
  {
698
  "epoch": 0.3168,
699
- "grad_norm": 1.9443378448486328,
700
  "learning_rate": 9.415001122838537e-05,
701
- "loss": 0.2379,
702
  "step": 990
703
  },
704
  {
705
  "epoch": 0.32,
706
- "grad_norm": 2.137860059738159,
707
  "learning_rate": 9.40377273748035e-05,
708
- "loss": 0.2396,
709
  "step": 1000
710
  },
711
  {
712
  "epoch": 0.3232,
713
- "grad_norm": 2.6493215560913086,
714
  "learning_rate": 9.392544352122165e-05,
715
- "loss": 0.2294,
716
  "step": 1010
717
  },
718
  {
719
  "epoch": 0.3264,
720
- "grad_norm": 3.381498336791992,
721
  "learning_rate": 9.381315966763979e-05,
722
- "loss": 0.2162,
723
  "step": 1020
724
  },
725
  {
726
  "epoch": 0.3296,
727
- "grad_norm": 2.907684564590454,
728
  "learning_rate": 9.370087581405794e-05,
729
- "loss": 0.194,
730
  "step": 1030
731
  },
732
  {
733
  "epoch": 0.3328,
734
- "grad_norm": 2.3227946758270264,
735
  "learning_rate": 9.358859196047609e-05,
736
- "loss": 0.2071,
737
  "step": 1040
738
  },
739
  {
740
  "epoch": 0.336,
741
- "grad_norm": 2.3173024654388428,
742
  "learning_rate": 9.347630810689423e-05,
743
- "loss": 0.2018,
744
  "step": 1050
745
  },
746
  {
747
  "epoch": 0.3392,
748
- "grad_norm": 2.9500386714935303,
749
  "learning_rate": 9.336402425331238e-05,
750
- "loss": 0.1987,
751
  "step": 1060
752
  },
753
  {
754
  "epoch": 0.3424,
755
- "grad_norm": 5.206624984741211,
756
  "learning_rate": 9.325174039973053e-05,
757
- "loss": 0.2036,
758
  "step": 1070
759
  },
760
  {
761
  "epoch": 0.3456,
762
- "grad_norm": 1.9075260162353516,
763
  "learning_rate": 9.313945654614867e-05,
764
- "loss": 0.1785,
765
  "step": 1080
766
  },
767
  {
768
  "epoch": 0.3488,
769
- "grad_norm": 1.7478675842285156,
770
  "learning_rate": 9.30271726925668e-05,
771
- "loss": 0.1956,
772
  "step": 1090
773
  },
774
  {
775
  "epoch": 0.352,
776
- "grad_norm": 2.1234798431396484,
777
  "learning_rate": 9.291488883898496e-05,
778
- "loss": 0.175,
779
  "step": 1100
780
  },
781
  {
782
  "epoch": 0.3552,
783
- "grad_norm": 1.828234076499939,
784
  "learning_rate": 9.280260498540311e-05,
785
- "loss": 0.1591,
786
  "step": 1110
787
  },
788
  {
789
  "epoch": 0.3584,
790
- "grad_norm": 1.8341890573501587,
791
  "learning_rate": 9.269032113182124e-05,
792
- "loss": 0.1649,
793
  "step": 1120
794
  },
795
  {
796
  "epoch": 0.3616,
797
- "grad_norm": 1.3575913906097412,
798
  "learning_rate": 9.25780372782394e-05,
799
- "loss": 0.1694,
800
  "step": 1130
801
  },
802
  {
803
  "epoch": 0.3648,
804
- "grad_norm": 2.1410560607910156,
805
  "learning_rate": 9.246575342465755e-05,
806
- "loss": 0.1659,
807
  "step": 1140
808
  },
809
  {
810
  "epoch": 0.368,
811
- "grad_norm": 1.7794946432113647,
812
  "learning_rate": 9.235346957107568e-05,
813
- "loss": 0.1479,
814
  "step": 1150
815
  },
816
  {
817
  "epoch": 0.3712,
818
- "grad_norm": 2.1806530952453613,
819
  "learning_rate": 9.224118571749383e-05,
820
- "loss": 0.151,
821
  "step": 1160
822
  },
823
  {
824
  "epoch": 0.3744,
825
- "grad_norm": 1.5767009258270264,
826
  "learning_rate": 9.212890186391197e-05,
827
- "loss": 0.1541,
828
  "step": 1170
829
  },
830
  {
831
  "epoch": 0.3776,
832
- "grad_norm": 1.453840970993042,
833
  "learning_rate": 9.201661801033011e-05,
834
- "loss": 0.1368,
835
  "step": 1180
836
  },
837
  {
838
  "epoch": 0.3808,
839
- "grad_norm": 1.8884810209274292,
840
  "learning_rate": 9.190433415674826e-05,
841
- "loss": 0.1394,
842
  "step": 1190
843
  },
844
  {
845
  "epoch": 0.384,
846
- "grad_norm": 1.6665995121002197,
847
  "learning_rate": 9.179205030316641e-05,
848
- "loss": 0.1356,
849
  "step": 1200
850
  },
851
  {
852
  "epoch": 0.3872,
853
- "grad_norm": 1.6958074569702148,
854
  "learning_rate": 9.167976644958456e-05,
855
- "loss": 0.1468,
856
  "step": 1210
857
  },
858
  {
859
  "epoch": 0.3904,
860
- "grad_norm": 1.609466552734375,
861
  "learning_rate": 9.15674825960027e-05,
862
- "loss": 0.1396,
863
  "step": 1220
864
  },
865
  {
866
  "epoch": 0.3936,
867
- "grad_norm": 2.045232057571411,
868
  "learning_rate": 9.145519874242085e-05,
869
- "loss": 0.1234,
870
  "step": 1230
871
  },
872
  {
873
  "epoch": 0.3968,
874
- "grad_norm": 1.8025046586990356,
875
  "learning_rate": 9.1342914888839e-05,
876
- "loss": 0.1335,
877
  "step": 1240
878
  },
879
  {
880
  "epoch": 0.4,
881
- "grad_norm": 1.4416459798812866,
882
  "learning_rate": 9.123063103525713e-05,
883
- "loss": 0.1325,
884
  "step": 1250
885
  },
886
  {
887
  "epoch": 0.4032,
888
- "grad_norm": 2.590508460998535,
889
  "learning_rate": 9.111834718167527e-05,
890
- "loss": 0.1258,
891
  "step": 1260
892
  },
893
  {
894
  "epoch": 0.4064,
895
- "grad_norm": 1.7362828254699707,
896
  "learning_rate": 9.100606332809342e-05,
897
- "loss": 0.1337,
898
  "step": 1270
899
  },
900
  {
901
  "epoch": 0.4096,
902
- "grad_norm": 1.726511836051941,
903
  "learning_rate": 9.089377947451156e-05,
904
- "loss": 0.1274,
905
  "step": 1280
906
  },
907
  {
908
  "epoch": 0.4128,
909
- "grad_norm": 1.3185865879058838,
910
  "learning_rate": 9.078149562092971e-05,
911
- "loss": 0.1244,
912
  "step": 1290
913
  },
914
  {
915
  "epoch": 0.416,
916
- "grad_norm": 1.6214089393615723,
917
  "learning_rate": 9.066921176734786e-05,
918
- "loss": 0.1002,
919
  "step": 1300
920
  },
921
  {
922
  "epoch": 0.4192,
923
- "grad_norm": 1.206648826599121,
924
  "learning_rate": 9.055692791376601e-05,
925
- "loss": 0.1169,
926
  "step": 1310
927
  },
928
  {
929
  "epoch": 0.4224,
930
- "grad_norm": 1.020280122756958,
931
  "learning_rate": 9.044464406018415e-05,
932
- "loss": 0.1036,
933
  "step": 1320
934
  },
935
  {
936
  "epoch": 0.4256,
937
- "grad_norm": 1.86745285987854,
938
  "learning_rate": 9.03323602066023e-05,
939
- "loss": 0.1163,
940
  "step": 1330
941
  },
942
  {
943
  "epoch": 0.4288,
944
- "grad_norm": 1.2367403507232666,
945
  "learning_rate": 9.022007635302045e-05,
946
- "loss": 0.1166,
947
  "step": 1340
948
  },
949
  {
950
  "epoch": 0.432,
951
- "grad_norm": 1.3708258867263794,
952
  "learning_rate": 9.010779249943859e-05,
953
- "loss": 0.0956,
954
  "step": 1350
955
  },
956
  {
957
  "epoch": 0.4352,
958
- "grad_norm": 1.9553757905960083,
959
  "learning_rate": 8.999550864585672e-05,
960
- "loss": 0.1139,
961
  "step": 1360
962
  },
963
  {
964
  "epoch": 0.4384,
965
- "grad_norm": 2.1358702182769775,
966
  "learning_rate": 8.988322479227488e-05,
967
- "loss": 0.1094,
968
  "step": 1370
969
  },
970
  {
971
  "epoch": 0.4416,
972
- "grad_norm": 3.4062862396240234,
973
  "learning_rate": 8.977094093869301e-05,
974
- "loss": 0.1017,
975
  "step": 1380
976
  },
977
  {
978
  "epoch": 0.4448,
979
- "grad_norm": 1.4105799198150635,
980
  "learning_rate": 8.965865708511116e-05,
981
- "loss": 0.1008,
982
  "step": 1390
983
  },
984
  {
985
  "epoch": 0.448,
986
- "grad_norm": 1.713500738143921,
987
  "learning_rate": 8.954637323152931e-05,
988
- "loss": 0.1078,
989
  "step": 1400
990
  },
991
  {
992
  "epoch": 0.4512,
993
- "grad_norm": 1.128848671913147,
994
  "learning_rate": 8.943408937794746e-05,
995
- "loss": 0.093,
996
  "step": 1410
997
  },
998
  {
999
  "epoch": 0.4544,
1000
- "grad_norm": 1.3671025037765503,
1001
  "learning_rate": 8.93218055243656e-05,
1002
- "loss": 0.1093,
1003
  "step": 1420
1004
  },
1005
  {
1006
  "epoch": 0.4576,
1007
- "grad_norm": 1.8151606321334839,
1008
  "learning_rate": 8.920952167078375e-05,
1009
- "loss": 0.1088,
1010
  "step": 1430
1011
  },
1012
  {
1013
  "epoch": 0.4608,
1014
- "grad_norm": 1.1207199096679688,
1015
  "learning_rate": 8.909723781720189e-05,
1016
- "loss": 0.1051,
1017
  "step": 1440
1018
  },
1019
  {
1020
  "epoch": 0.464,
1021
- "grad_norm": 1.0649629831314087,
1022
  "learning_rate": 8.898495396362003e-05,
1023
- "loss": 0.1,
1024
  "step": 1450
1025
  },
1026
  {
1027
  "epoch": 0.4672,
1028
- "grad_norm": 1.4365625381469727,
1029
  "learning_rate": 8.887267011003818e-05,
1030
- "loss": 0.0861,
1031
  "step": 1460
1032
  },
1033
  {
1034
  "epoch": 0.4704,
1035
- "grad_norm": 1.1462955474853516,
1036
  "learning_rate": 8.876038625645633e-05,
1037
- "loss": 0.0866,
1038
  "step": 1470
1039
  },
1040
  {
1041
  "epoch": 0.4736,
1042
- "grad_norm": 1.3217401504516602,
1043
  "learning_rate": 8.864810240287447e-05,
1044
- "loss": 0.0979,
1045
  "step": 1480
1046
  },
1047
  {
1048
  "epoch": 0.4768,
1049
- "grad_norm": 1.3830331563949585,
1050
  "learning_rate": 8.853581854929262e-05,
1051
- "loss": 0.0792,
1052
  "step": 1490
1053
  },
1054
  {
1055
  "epoch": 0.48,
1056
- "grad_norm": 1.777252435684204,
1057
  "learning_rate": 8.842353469571077e-05,
1058
- "loss": 0.0727,
1059
  "step": 1500
1060
  },
1061
  {
1062
  "epoch": 0.4832,
1063
- "grad_norm": 1.220021367073059,
1064
  "learning_rate": 8.83112508421289e-05,
1065
- "loss": 0.0836,
1066
  "step": 1510
1067
  },
1068
  {
1069
  "epoch": 0.4864,
1070
- "grad_norm": 2.3963851928710938,
1071
  "learning_rate": 8.819896698854705e-05,
1072
- "loss": 0.0851,
1073
  "step": 1520
1074
  },
1075
  {
1076
  "epoch": 0.4896,
1077
- "grad_norm": 1.1223018169403076,
1078
  "learning_rate": 8.808668313496519e-05,
1079
- "loss": 0.082,
1080
  "step": 1530
1081
  },
1082
  {
1083
  "epoch": 0.4928,
1084
- "grad_norm": 1.148827314376831,
1085
  "learning_rate": 8.797439928138334e-05,
1086
- "loss": 0.0758,
1087
  "step": 1540
1088
  },
1089
  {
1090
  "epoch": 0.496,
1091
- "grad_norm": 0.8612151145935059,
1092
  "learning_rate": 8.786211542780148e-05,
1093
- "loss": 0.0783,
1094
  "step": 1550
1095
  },
1096
  {
1097
  "epoch": 0.4992,
1098
- "grad_norm": 1.1042686700820923,
1099
  "learning_rate": 8.774983157421963e-05,
1100
- "loss": 0.0846,
1101
  "step": 1560
1102
  },
1103
  {
1104
  "epoch": 0.5024,
1105
- "grad_norm": 1.3059678077697754,
1106
  "learning_rate": 8.763754772063778e-05,
1107
- "loss": 0.0842,
1108
  "step": 1570
1109
  },
1110
  {
1111
  "epoch": 0.5056,
1112
- "grad_norm": 1.976445198059082,
1113
  "learning_rate": 8.752526386705592e-05,
1114
- "loss": 0.0811,
1115
  "step": 1580
1116
  },
1117
  {
1118
  "epoch": 0.5088,
1119
- "grad_norm": 1.7577661275863647,
1120
  "learning_rate": 8.741298001347407e-05,
1121
- "loss": 0.0848,
1122
  "step": 1590
1123
  },
1124
  {
1125
  "epoch": 0.512,
1126
- "grad_norm": 1.2085758447647095,
1127
  "learning_rate": 8.730069615989222e-05,
1128
- "loss": 0.077,
1129
  "step": 1600
1130
  },
1131
  {
1132
  "epoch": 0.5152,
1133
- "grad_norm": 1.1045840978622437,
1134
  "learning_rate": 8.718841230631036e-05,
1135
- "loss": 0.0794,
1136
  "step": 1610
1137
  },
1138
  {
1139
  "epoch": 0.5184,
1140
- "grad_norm": 2.760986328125,
1141
  "learning_rate": 8.70761284527285e-05,
1142
- "loss": 0.0887,
1143
  "step": 1620
1144
  },
1145
  {
1146
  "epoch": 0.5216,
1147
- "grad_norm": 1.1649103164672852,
1148
  "learning_rate": 8.696384459914664e-05,
1149
- "loss": 0.0806,
1150
  "step": 1630
1151
  },
1152
  {
1153
  "epoch": 0.5248,
1154
- "grad_norm": 2.1718943119049072,
1155
  "learning_rate": 8.68515607455648e-05,
1156
- "loss": 0.0774,
1157
  "step": 1640
1158
  },
1159
  {
1160
  "epoch": 0.528,
1161
- "grad_norm": 5.1032586097717285,
1162
  "learning_rate": 8.673927689198293e-05,
1163
- "loss": 0.0823,
1164
  "step": 1650
1165
  },
1166
  {
1167
  "epoch": 0.5312,
1168
- "grad_norm": 1.3188016414642334,
1169
  "learning_rate": 8.662699303840108e-05,
1170
- "loss": 0.0723,
1171
  "step": 1660
1172
  },
1173
  {
1174
  "epoch": 0.5344,
1175
- "grad_norm": 1.1091735363006592,
1176
  "learning_rate": 8.651470918481923e-05,
1177
- "loss": 0.0801,
1178
  "step": 1670
1179
  },
1180
  {
1181
  "epoch": 0.5376,
1182
- "grad_norm": 1.944667935371399,
1183
  "learning_rate": 8.640242533123737e-05,
1184
- "loss": 0.0656,
1185
  "step": 1680
1186
  },
1187
  {
1188
  "epoch": 0.5408,
1189
- "grad_norm": 1.409705400466919,
1190
  "learning_rate": 8.629014147765552e-05,
1191
- "loss": 0.0783,
1192
  "step": 1690
1193
  },
1194
  {
1195
  "epoch": 0.544,
1196
- "grad_norm": 1.0202471017837524,
1197
  "learning_rate": 8.617785762407367e-05,
1198
- "loss": 0.0698,
1199
  "step": 1700
1200
  },
1201
  {
1202
  "epoch": 0.5472,
1203
- "grad_norm": 0.8339371681213379,
1204
  "learning_rate": 8.606557377049181e-05,
1205
- "loss": 0.0646,
1206
  "step": 1710
1207
  },
1208
  {
1209
  "epoch": 0.5504,
1210
- "grad_norm": 1.0416721105575562,
1211
  "learning_rate": 8.595328991690995e-05,
1212
- "loss": 0.0631,
1213
  "step": 1720
1214
  },
1215
  {
1216
  "epoch": 0.5536,
1217
- "grad_norm": 0.812588632106781,
1218
  "learning_rate": 8.58410060633281e-05,
1219
- "loss": 0.0718,
1220
  "step": 1730
1221
  },
1222
  {
1223
  "epoch": 0.5568,
1224
- "grad_norm": 1.2019861936569214,
1225
  "learning_rate": 8.572872220974623e-05,
1226
- "loss": 0.0766,
1227
  "step": 1740
1228
  },
1229
  {
1230
  "epoch": 0.56,
1231
- "grad_norm": 0.8724514842033386,
1232
  "learning_rate": 8.561643835616438e-05,
1233
- "loss": 0.066,
1234
  "step": 1750
1235
  },
1236
  {
1237
  "epoch": 0.5632,
1238
- "grad_norm": 0.9269486665725708,
1239
  "learning_rate": 8.550415450258253e-05,
1240
- "loss": 0.0711,
1241
  "step": 1760
1242
  },
1243
  {
1244
  "epoch": 0.5664,
1245
- "grad_norm": 1.1435014009475708,
1246
  "learning_rate": 8.539187064900069e-05,
1247
- "loss": 0.0649,
1248
  "step": 1770
1249
  },
1250
  {
1251
  "epoch": 0.5696,
1252
- "grad_norm": 0.9017223119735718,
1253
  "learning_rate": 8.527958679541882e-05,
1254
- "loss": 0.0743,
1255
  "step": 1780
1256
  },
1257
  {
1258
  "epoch": 0.5728,
1259
- "grad_norm": 1.5114269256591797,
1260
  "learning_rate": 8.516730294183697e-05,
1261
- "loss": 0.0711,
1262
  "step": 1790
1263
  },
1264
  {
1265
  "epoch": 0.576,
1266
- "grad_norm": 1.1478126049041748,
1267
  "learning_rate": 8.505501908825511e-05,
1268
- "loss": 0.0749,
1269
  "step": 1800
1270
  },
1271
  {
1272
  "epoch": 0.5792,
1273
- "grad_norm": 1.3925341367721558,
1274
  "learning_rate": 8.494273523467325e-05,
1275
- "loss": 0.0623,
1276
  "step": 1810
1277
  },
1278
  {
1279
  "epoch": 0.5824,
1280
- "grad_norm": 0.7855392098426819,
1281
  "learning_rate": 8.48304513810914e-05,
1282
- "loss": 0.0568,
1283
  "step": 1820
1284
  },
1285
  {
1286
  "epoch": 0.5856,
1287
- "grad_norm": 0.7520506381988525,
1288
  "learning_rate": 8.471816752750955e-05,
1289
- "loss": 0.0654,
1290
  "step": 1830
1291
  },
1292
  {
1293
  "epoch": 0.5888,
1294
- "grad_norm": 0.8448010087013245,
1295
  "learning_rate": 8.460588367392769e-05,
1296
- "loss": 0.0624,
1297
  "step": 1840
1298
  },
1299
  {
1300
  "epoch": 0.592,
1301
- "grad_norm": 0.7805534601211548,
1302
  "learning_rate": 8.449359982034584e-05,
1303
- "loss": 0.0676,
1304
  "step": 1850
1305
  },
1306
  {
1307
  "epoch": 0.5952,
1308
- "grad_norm": 1.1754975318908691,
1309
  "learning_rate": 8.438131596676399e-05,
1310
- "loss": 0.0609,
1311
  "step": 1860
1312
  },
1313
  {
1314
  "epoch": 0.5984,
1315
- "grad_norm": 0.7776190638542175,
1316
  "learning_rate": 8.426903211318214e-05,
1317
- "loss": 0.0601,
1318
  "step": 1870
1319
  },
1320
  {
1321
  "epoch": 0.6016,
1322
- "grad_norm": 1.610683560371399,
1323
  "learning_rate": 8.415674825960028e-05,
1324
- "loss": 0.0618,
1325
  "step": 1880
1326
  },
1327
  {
1328
  "epoch": 0.6048,
1329
- "grad_norm": 0.8926658630371094,
1330
  "learning_rate": 8.404446440601843e-05,
1331
- "loss": 0.0582,
1332
  "step": 1890
1333
  },
1334
  {
1335
  "epoch": 0.608,
1336
- "grad_norm": 1.2540363073349,
1337
  "learning_rate": 8.393218055243656e-05,
1338
- "loss": 0.0623,
1339
  "step": 1900
1340
  },
1341
  {
1342
  "epoch": 0.6112,
1343
- "grad_norm": 1.463120937347412,
1344
  "learning_rate": 8.38198966988547e-05,
1345
- "loss": 0.0595,
1346
  "step": 1910
1347
  },
1348
  {
1349
  "epoch": 0.6144,
1350
- "grad_norm": 1.3101223707199097,
1351
  "learning_rate": 8.370761284527285e-05,
1352
- "loss": 0.0653,
1353
  "step": 1920
1354
  },
1355
  {
1356
  "epoch": 0.6176,
1357
- "grad_norm": 1.1716495752334595,
1358
  "learning_rate": 8.3595328991691e-05,
1359
- "loss": 0.0518,
1360
  "step": 1930
1361
  },
1362
  {
1363
  "epoch": 0.6208,
1364
- "grad_norm": 1.539556860923767,
1365
  "learning_rate": 8.348304513810914e-05,
1366
- "loss": 0.0496,
1367
  "step": 1940
1368
  },
1369
  {
1370
  "epoch": 0.624,
1371
- "grad_norm": 0.8535395264625549,
1372
  "learning_rate": 8.337076128452729e-05,
1373
- "loss": 0.0581,
1374
  "step": 1950
1375
  },
1376
  {
1377
  "epoch": 0.6272,
1378
- "grad_norm": 0.9112345576286316,
1379
  "learning_rate": 8.325847743094544e-05,
1380
- "loss": 0.0568,
1381
  "step": 1960
1382
  },
1383
  {
1384
  "epoch": 0.6304,
1385
- "grad_norm": 0.9361368417739868,
1386
  "learning_rate": 8.314619357736358e-05,
1387
- "loss": 0.0507,
1388
  "step": 1970
1389
  },
1390
  {
1391
  "epoch": 0.6336,
1392
- "grad_norm": 1.4482288360595703,
1393
  "learning_rate": 8.303390972378173e-05,
1394
- "loss": 0.0521,
1395
  "step": 1980
1396
  },
1397
  {
1398
  "epoch": 0.6368,
1399
- "grad_norm": 0.9236279129981995,
1400
  "learning_rate": 8.292162587019986e-05,
1401
- "loss": 0.0512,
1402
  "step": 1990
1403
  },
1404
  {
1405
  "epoch": 0.64,
1406
- "grad_norm": 1.238692045211792,
1407
  "learning_rate": 8.280934201661802e-05,
1408
- "loss": 0.0538,
1409
  "step": 2000
1410
  },
1411
  {
1412
  "epoch": 0.6432,
1413
- "grad_norm": 0.9093800187110901,
1414
  "learning_rate": 8.269705816303615e-05,
1415
- "loss": 0.054,
1416
  "step": 2010
1417
  },
1418
  {
1419
  "epoch": 0.6464,
1420
- "grad_norm": 0.7657427787780762,
1421
  "learning_rate": 8.25847743094543e-05,
1422
- "loss": 0.049,
1423
  "step": 2020
1424
  },
1425
  {
1426
  "epoch": 0.6496,
1427
- "grad_norm": 1.5526565313339233,
1428
  "learning_rate": 8.247249045587245e-05,
1429
- "loss": 0.0524,
1430
  "step": 2030
1431
  },
1432
  {
1433
  "epoch": 0.6528,
1434
- "grad_norm": 0.7204543352127075,
1435
  "learning_rate": 8.236020660229059e-05,
1436
- "loss": 0.0555,
1437
  "step": 2040
1438
  },
1439
  {
1440
  "epoch": 0.656,
1441
- "grad_norm": 0.8594505190849304,
1442
  "learning_rate": 8.224792274870874e-05,
1443
- "loss": 0.0529,
1444
  "step": 2050
1445
  },
1446
  {
1447
  "epoch": 0.6592,
1448
- "grad_norm": 1.0530890226364136,
1449
  "learning_rate": 8.213563889512689e-05,
1450
- "loss": 0.0621,
1451
  "step": 2060
1452
  },
1453
  {
1454
  "epoch": 0.6624,
1455
- "grad_norm": 0.92835533618927,
1456
  "learning_rate": 8.202335504154503e-05,
1457
- "loss": 0.0434,
1458
  "step": 2070
1459
  },
1460
  {
1461
  "epoch": 0.6656,
1462
- "grad_norm": 1.0584304332733154,
1463
  "learning_rate": 8.191107118796317e-05,
1464
- "loss": 0.0515,
1465
  "step": 2080
1466
  },
1467
  {
1468
  "epoch": 0.6688,
1469
- "grad_norm": 0.7033383250236511,
1470
  "learning_rate": 8.179878733438132e-05,
1471
- "loss": 0.0451,
1472
  "step": 2090
1473
  },
1474
  {
1475
  "epoch": 0.672,
1476
- "grad_norm": 1.09385085105896,
1477
  "learning_rate": 8.168650348079947e-05,
1478
- "loss": 0.0486,
1479
  "step": 2100
1480
  },
1481
  {
1482
  "epoch": 0.6752,
1483
- "grad_norm": 1.0709513425827026,
1484
  "learning_rate": 8.15742196272176e-05,
1485
- "loss": 0.0571,
1486
  "step": 2110
1487
  },
1488
  {
1489
  "epoch": 0.6784,
1490
- "grad_norm": 0.7316601276397705,
1491
  "learning_rate": 8.146193577363576e-05,
1492
- "loss": 0.0496,
1493
  "step": 2120
1494
  },
1495
  {
1496
  "epoch": 0.6816,
1497
- "grad_norm": 0.9458874464035034,
1498
  "learning_rate": 8.13496519200539e-05,
1499
- "loss": 0.054,
1500
  "step": 2130
1501
  },
1502
  {
1503
  "epoch": 0.6848,
1504
- "grad_norm": 2.1238739490509033,
1505
  "learning_rate": 8.123736806647204e-05,
1506
- "loss": 0.0535,
1507
  "step": 2140
1508
  },
1509
  {
1510
  "epoch": 0.688,
1511
- "grad_norm": 1.011391520500183,
1512
  "learning_rate": 8.11250842128902e-05,
1513
- "loss": 0.0527,
1514
  "step": 2150
1515
  },
1516
  {
1517
  "epoch": 0.6912,
1518
- "grad_norm": 0.8783167004585266,
1519
  "learning_rate": 8.101280035930835e-05,
1520
- "loss": 0.0424,
1521
  "step": 2160
1522
  },
1523
  {
1524
  "epoch": 0.6944,
1525
- "grad_norm": 0.9206530451774597,
1526
  "learning_rate": 8.090051650572648e-05,
1527
- "loss": 0.0588,
1528
  "step": 2170
1529
  },
1530
  {
1531
  "epoch": 0.6976,
1532
- "grad_norm": 0.5304967164993286,
1533
  "learning_rate": 8.078823265214462e-05,
1534
- "loss": 0.0574,
1535
  "step": 2180
1536
  },
1537
  {
1538
  "epoch": 0.7008,
1539
- "grad_norm": 0.5870644450187683,
1540
  "learning_rate": 8.067594879856277e-05,
1541
- "loss": 0.0512,
1542
  "step": 2190
1543
  },
1544
  {
1545
  "epoch": 0.704,
1546
- "grad_norm": 0.7831016182899475,
1547
  "learning_rate": 8.056366494498092e-05,
1548
- "loss": 0.0485,
1549
  "step": 2200
1550
  },
1551
  {
1552
  "epoch": 0.7072,
1553
- "grad_norm": 2.5291478633880615,
1554
  "learning_rate": 8.045138109139906e-05,
1555
- "loss": 0.058,
1556
  "step": 2210
1557
  },
1558
  {
1559
  "epoch": 0.7104,
1560
- "grad_norm": 0.705797553062439,
1561
  "learning_rate": 8.033909723781721e-05,
1562
- "loss": 0.0534,
1563
  "step": 2220
1564
  },
1565
  {
1566
  "epoch": 0.7136,
1567
- "grad_norm": 0.7620034217834473,
1568
  "learning_rate": 8.022681338423536e-05,
1569
- "loss": 0.0466,
1570
  "step": 2230
1571
  },
1572
  {
1573
  "epoch": 0.7168,
1574
- "grad_norm": 2.0571022033691406,
1575
  "learning_rate": 8.01145295306535e-05,
1576
- "loss": 0.0469,
1577
  "step": 2240
1578
  },
1579
  {
1580
  "epoch": 0.72,
1581
- "grad_norm": 1.094312310218811,
1582
  "learning_rate": 8.000224567707165e-05,
1583
- "loss": 0.0482,
1584
  "step": 2250
1585
  },
1586
  {
1587
  "epoch": 0.7232,
1588
- "grad_norm": 0.857584536075592,
1589
  "learning_rate": 7.988996182348978e-05,
1590
- "loss": 0.044,
1591
  "step": 2260
1592
  },
1593
  {
1594
  "epoch": 0.7264,
1595
- "grad_norm": 0.7637543082237244,
1596
  "learning_rate": 7.977767796990792e-05,
1597
- "loss": 0.0493,
1598
  "step": 2270
1599
  },
1600
  {
1601
  "epoch": 0.7296,
1602
- "grad_norm": 0.6703894138336182,
1603
  "learning_rate": 7.966539411632607e-05,
1604
- "loss": 0.0462,
1605
  "step": 2280
1606
  },
1607
  {
1608
  "epoch": 0.7328,
1609
- "grad_norm": 0.7242295145988464,
1610
  "learning_rate": 7.955311026274422e-05,
1611
- "loss": 0.044,
1612
  "step": 2290
1613
  },
1614
  {
1615
  "epoch": 0.736,
1616
- "grad_norm": 1.0875325202941895,
1617
  "learning_rate": 7.944082640916236e-05,
1618
- "loss": 0.0458,
1619
  "step": 2300
1620
  },
1621
  {
1622
  "epoch": 0.7392,
1623
- "grad_norm": 0.8322548270225525,
1624
  "learning_rate": 7.932854255558051e-05,
1625
- "loss": 0.0471,
1626
  "step": 2310
1627
  },
1628
  {
1629
  "epoch": 0.7424,
1630
- "grad_norm": 1.6921989917755127,
1631
  "learning_rate": 7.921625870199866e-05,
1632
- "loss": 0.0481,
1633
  "step": 2320
1634
  },
1635
  {
1636
  "epoch": 0.7456,
1637
- "grad_norm": 0.9339900612831116,
1638
  "learning_rate": 7.910397484841681e-05,
1639
- "loss": 0.0438,
1640
  "step": 2330
1641
  },
1642
  {
1643
  "epoch": 0.7488,
1644
- "grad_norm": 0.9007784724235535,
1645
  "learning_rate": 7.899169099483495e-05,
1646
- "loss": 0.051,
1647
  "step": 2340
1648
  },
1649
  {
1650
  "epoch": 0.752,
1651
- "grad_norm": 0.7366623282432556,
1652
  "learning_rate": 7.887940714125309e-05,
1653
- "loss": 0.0488,
1654
  "step": 2350
1655
  },
1656
  {
1657
  "epoch": 0.7552,
1658
- "grad_norm": 1.0581986904144287,
1659
  "learning_rate": 7.876712328767124e-05,
1660
- "loss": 0.0465,
1661
  "step": 2360
1662
  },
1663
  {
1664
  "epoch": 0.7584,
1665
- "grad_norm": 0.8398572206497192,
1666
  "learning_rate": 7.865483943408937e-05,
1667
- "loss": 0.0444,
1668
  "step": 2370
1669
  },
1670
  {
1671
  "epoch": 0.7616,
1672
- "grad_norm": 0.829765796661377,
1673
  "learning_rate": 7.854255558050752e-05,
1674
- "loss": 0.0468,
1675
  "step": 2380
1676
  },
1677
  {
1678
  "epoch": 0.7648,
1679
- "grad_norm": 0.8922726511955261,
1680
  "learning_rate": 7.843027172692568e-05,
1681
- "loss": 0.0429,
1682
  "step": 2390
1683
  },
1684
  {
1685
  "epoch": 0.768,
1686
- "grad_norm": 0.9981004595756531,
1687
  "learning_rate": 7.831798787334381e-05,
1688
- "loss": 0.0425,
1689
  "step": 2400
1690
  },
1691
  {
1692
  "epoch": 0.7712,
1693
- "grad_norm": 1.1781072616577148,
1694
  "learning_rate": 7.820570401976196e-05,
1695
- "loss": 0.0467,
1696
  "step": 2410
1697
  },
1698
  {
1699
  "epoch": 0.7744,
1700
- "grad_norm": 1.295114517211914,
1701
  "learning_rate": 7.809342016618011e-05,
1702
- "loss": 0.0429,
1703
  "step": 2420
1704
  },
1705
  {
1706
  "epoch": 0.7776,
1707
- "grad_norm": 1.2923245429992676,
1708
  "learning_rate": 7.798113631259825e-05,
1709
- "loss": 0.0411,
1710
  "step": 2430
1711
  },
1712
  {
1713
  "epoch": 0.7808,
1714
- "grad_norm": 0.9972735643386841,
1715
  "learning_rate": 7.78688524590164e-05,
1716
- "loss": 0.0392,
1717
  "step": 2440
1718
  },
1719
  {
1720
  "epoch": 0.784,
1721
- "grad_norm": 0.7741293907165527,
1722
  "learning_rate": 7.775656860543454e-05,
1723
- "loss": 0.0432,
1724
  "step": 2450
1725
  },
1726
  {
1727
  "epoch": 0.7872,
1728
- "grad_norm": 0.5855127573013306,
1729
  "learning_rate": 7.764428475185269e-05,
1730
- "loss": 0.0468,
1731
  "step": 2460
1732
  },
1733
  {
1734
  "epoch": 0.7904,
1735
- "grad_norm": 0.6745654940605164,
1736
  "learning_rate": 7.753200089827083e-05,
1737
- "loss": 0.0394,
1738
  "step": 2470
1739
  },
1740
  {
1741
  "epoch": 0.7936,
1742
- "grad_norm": 1.2831262350082397,
1743
  "learning_rate": 7.741971704468898e-05,
1744
- "loss": 0.037,
1745
  "step": 2480
1746
  },
1747
  {
1748
  "epoch": 0.7968,
1749
- "grad_norm": 0.6621804237365723,
1750
  "learning_rate": 7.730743319110713e-05,
1751
- "loss": 0.0413,
1752
  "step": 2490
1753
  },
1754
  {
1755
  "epoch": 0.8,
1756
- "grad_norm": 0.5354174375534058,
1757
  "learning_rate": 7.719514933752526e-05,
1758
- "loss": 0.0375,
1759
  "step": 2500
1760
  },
1761
  {
1762
  "epoch": 0.8032,
1763
- "grad_norm": 0.9441866278648376,
1764
  "learning_rate": 7.708286548394342e-05,
1765
- "loss": 0.0364,
1766
  "step": 2510
1767
  },
1768
  {
1769
  "epoch": 0.8064,
1770
- "grad_norm": 0.6830460429191589,
1771
  "learning_rate": 7.697058163036157e-05,
1772
- "loss": 0.042,
1773
  "step": 2520
1774
  },
1775
  {
1776
  "epoch": 0.8096,
1777
- "grad_norm": 0.6004045605659485,
1778
  "learning_rate": 7.68582977767797e-05,
1779
- "loss": 0.04,
1780
  "step": 2530
1781
  },
1782
  {
1783
  "epoch": 0.8128,
1784
- "grad_norm": 0.6852745413780212,
1785
  "learning_rate": 7.674601392319784e-05,
1786
- "loss": 0.0404,
1787
  "step": 2540
1788
  },
1789
  {
1790
  "epoch": 0.816,
1791
- "grad_norm": 1.0080032348632812,
1792
  "learning_rate": 7.663373006961599e-05,
1793
- "loss": 0.043,
1794
  "step": 2550
1795
  },
1796
  {
1797
  "epoch": 0.8192,
1798
- "grad_norm": 0.7791699767112732,
1799
  "learning_rate": 7.652144621603414e-05,
1800
- "loss": 0.0411,
1801
  "step": 2560
1802
  },
1803
  {
1804
  "epoch": 0.8224,
1805
- "grad_norm": 1.192091464996338,
1806
  "learning_rate": 7.640916236245228e-05,
1807
- "loss": 0.0413,
1808
  "step": 2570
1809
  },
1810
  {
1811
  "epoch": 0.8256,
1812
- "grad_norm": 0.744788646697998,
1813
  "learning_rate": 7.629687850887043e-05,
1814
- "loss": 0.0401,
1815
  "step": 2580
1816
  },
1817
  {
1818
  "epoch": 0.8288,
1819
- "grad_norm": 0.6732345223426819,
1820
  "learning_rate": 7.618459465528858e-05,
1821
- "loss": 0.0426,
1822
  "step": 2590
1823
  },
1824
  {
1825
  "epoch": 0.832,
1826
- "grad_norm": 1.0868829488754272,
1827
  "learning_rate": 7.607231080170672e-05,
1828
- "loss": 0.0368,
1829
  "step": 2600
1830
  },
1831
  {
1832
  "epoch": 0.8352,
1833
- "grad_norm": 1.4248939752578735,
1834
  "learning_rate": 7.596002694812487e-05,
1835
- "loss": 0.0369,
1836
  "step": 2610
1837
  },
1838
  {
1839
  "epoch": 0.8384,
1840
- "grad_norm": 0.7218601107597351,
1841
  "learning_rate": 7.5847743094543e-05,
1842
- "loss": 0.0415,
1843
  "step": 2620
1844
  },
1845
  {
1846
  "epoch": 0.8416,
1847
- "grad_norm": 0.803717851638794,
1848
  "learning_rate": 7.573545924096114e-05,
1849
- "loss": 0.0352,
1850
  "step": 2630
1851
  },
1852
  {
1853
  "epoch": 0.8448,
1854
- "grad_norm": 0.821607768535614,
1855
  "learning_rate": 7.562317538737929e-05,
1856
- "loss": 0.039,
1857
  "step": 2640
1858
  },
1859
  {
1860
  "epoch": 0.848,
1861
- "grad_norm": 1.1404283046722412,
1862
  "learning_rate": 7.551089153379744e-05,
1863
- "loss": 0.0369,
1864
  "step": 2650
1865
  },
1866
  {
1867
  "epoch": 0.8512,
1868
- "grad_norm": 1.2288737297058105,
1869
  "learning_rate": 7.53986076802156e-05,
1870
- "loss": 0.0417,
1871
  "step": 2660
1872
  },
1873
  {
1874
  "epoch": 0.8544,
1875
- "grad_norm": 1.0263468027114868,
1876
  "learning_rate": 7.528632382663373e-05,
1877
- "loss": 0.0423,
1878
  "step": 2670
1879
  },
1880
  {
1881
  "epoch": 0.8576,
1882
- "grad_norm": 0.8517736196517944,
1883
  "learning_rate": 7.517403997305188e-05,
1884
- "loss": 0.0364,
1885
  "step": 2680
1886
  },
1887
  {
1888
  "epoch": 0.8608,
1889
- "grad_norm": 0.8727993369102478,
1890
  "learning_rate": 7.506175611947003e-05,
1891
- "loss": 0.0382,
1892
  "step": 2690
1893
  },
1894
  {
1895
  "epoch": 0.864,
1896
- "grad_norm": 0.7277560234069824,
1897
  "learning_rate": 7.494947226588817e-05,
1898
- "loss": 0.0368,
1899
  "step": 2700
1900
  },
1901
  {
1902
  "epoch": 0.8672,
1903
- "grad_norm": 0.854989230632782,
1904
  "learning_rate": 7.483718841230631e-05,
1905
- "loss": 0.0431,
1906
  "step": 2710
1907
  },
1908
  {
1909
  "epoch": 0.8704,
1910
- "grad_norm": 0.47089987993240356,
1911
  "learning_rate": 7.472490455872446e-05,
1912
- "loss": 0.0372,
1913
  "step": 2720
1914
  },
1915
  {
1916
  "epoch": 0.8736,
1917
- "grad_norm": 0.716643750667572,
1918
  "learning_rate": 7.46126207051426e-05,
1919
- "loss": 0.0348,
1920
  "step": 2730
1921
  },
1922
  {
1923
  "epoch": 0.8768,
1924
- "grad_norm": 0.8277871012687683,
1925
  "learning_rate": 7.450033685156075e-05,
1926
- "loss": 0.0369,
1927
  "step": 2740
1928
  },
1929
  {
1930
  "epoch": 0.88,
1931
- "grad_norm": 0.9618933796882629,
1932
  "learning_rate": 7.43880529979789e-05,
1933
- "loss": 0.0377,
1934
  "step": 2750
1935
  },
1936
  {
1937
  "epoch": 0.8832,
1938
- "grad_norm": 0.6898852586746216,
1939
  "learning_rate": 7.427576914439703e-05,
1940
- "loss": 0.0543,
1941
  "step": 2760
1942
  },
1943
  {
1944
  "epoch": 0.8864,
1945
- "grad_norm": 1.4362825155258179,
1946
  "learning_rate": 7.416348529081518e-05,
1947
- "loss": 0.0397,
1948
  "step": 2770
1949
  },
1950
  {
1951
  "epoch": 0.8896,
1952
- "grad_norm": 1.1972767114639282,
1953
  "learning_rate": 7.405120143723333e-05,
1954
- "loss": 0.0324,
1955
  "step": 2780
1956
  },
1957
  {
1958
  "epoch": 0.8928,
1959
- "grad_norm": 0.5438815355300903,
1960
  "learning_rate": 7.393891758365149e-05,
1961
- "loss": 0.0397,
1962
  "step": 2790
1963
  },
1964
  {
1965
  "epoch": 0.896,
1966
- "grad_norm": 0.513469398021698,
1967
  "learning_rate": 7.382663373006962e-05,
1968
- "loss": 0.0338,
1969
  "step": 2800
1970
  },
1971
  {
1972
  "epoch": 0.8992,
1973
- "grad_norm": 0.5743911266326904,
1974
  "learning_rate": 7.371434987648776e-05,
1975
- "loss": 0.0313,
1976
  "step": 2810
1977
  },
1978
  {
1979
  "epoch": 0.9024,
1980
- "grad_norm": 1.011957049369812,
1981
  "learning_rate": 7.360206602290591e-05,
1982
- "loss": 0.0374,
1983
  "step": 2820
1984
  },
1985
  {
1986
  "epoch": 0.9056,
1987
- "grad_norm": 0.6926620602607727,
1988
  "learning_rate": 7.348978216932405e-05,
1989
- "loss": 0.0392,
1990
  "step": 2830
1991
  },
1992
  {
1993
  "epoch": 0.9088,
1994
- "grad_norm": 0.6338510513305664,
1995
  "learning_rate": 7.33774983157422e-05,
1996
- "loss": 0.0359,
1997
  "step": 2840
1998
  },
1999
  {
2000
  "epoch": 0.912,
2001
- "grad_norm": 0.7649824023246765,
2002
  "learning_rate": 7.326521446216035e-05,
2003
- "loss": 0.0353,
2004
  "step": 2850
2005
  },
2006
  {
2007
  "epoch": 0.9152,
2008
- "grad_norm": 0.8123289346694946,
2009
  "learning_rate": 7.315293060857849e-05,
2010
- "loss": 0.0322,
2011
  "step": 2860
2012
  },
2013
  {
2014
  "epoch": 0.9184,
2015
- "grad_norm": 0.8033359050750732,
2016
  "learning_rate": 7.304064675499664e-05,
2017
- "loss": 0.0331,
2018
  "step": 2870
2019
  },
2020
  {
2021
  "epoch": 0.9216,
2022
- "grad_norm": 0.8859496116638184,
2023
  "learning_rate": 7.292836290141479e-05,
2024
- "loss": 0.0352,
2025
  "step": 2880
2026
  },
2027
  {
2028
  "epoch": 0.9248,
2029
- "grad_norm": 0.7962930202484131,
2030
  "learning_rate": 7.281607904783292e-05,
2031
- "loss": 0.0373,
2032
  "step": 2890
2033
  },
2034
  {
2035
  "epoch": 0.928,
2036
- "grad_norm": 0.746497392654419,
2037
  "learning_rate": 7.270379519425106e-05,
2038
- "loss": 0.0426,
2039
  "step": 2900
2040
  },
2041
  {
2042
  "epoch": 0.9312,
2043
- "grad_norm": 0.8344641327857971,
2044
  "learning_rate": 7.259151134066921e-05,
2045
- "loss": 0.0349,
2046
  "step": 2910
2047
  },
2048
  {
2049
  "epoch": 0.9344,
2050
- "grad_norm": 0.8275250792503357,
2051
  "learning_rate": 7.247922748708736e-05,
2052
- "loss": 0.0358,
2053
  "step": 2920
2054
  },
2055
  {
2056
  "epoch": 0.9376,
2057
- "grad_norm": 0.5994471907615662,
2058
  "learning_rate": 7.23669436335055e-05,
2059
- "loss": 0.0344,
2060
  "step": 2930
2061
  },
2062
  {
2063
  "epoch": 0.9408,
2064
- "grad_norm": 0.6452350616455078,
2065
  "learning_rate": 7.225465977992365e-05,
2066
- "loss": 0.0358,
2067
  "step": 2940
2068
  },
2069
  {
2070
  "epoch": 0.944,
2071
- "grad_norm": 1.0141571760177612,
2072
  "learning_rate": 7.21423759263418e-05,
2073
- "loss": 0.0347,
2074
  "step": 2950
2075
  },
2076
  {
2077
  "epoch": 0.9472,
2078
- "grad_norm": 0.832384467124939,
2079
  "learning_rate": 7.203009207275994e-05,
2080
- "loss": 0.0332,
2081
  "step": 2960
2082
  },
2083
  {
2084
  "epoch": 0.9504,
2085
- "grad_norm": 0.7129203677177429,
2086
  "learning_rate": 7.191780821917809e-05,
2087
- "loss": 0.0313,
2088
  "step": 2970
2089
  },
2090
  {
2091
  "epoch": 0.9536,
2092
- "grad_norm": 0.7890746593475342,
2093
  "learning_rate": 7.180552436559623e-05,
2094
- "loss": 0.0331,
2095
  "step": 2980
2096
  },
2097
  {
2098
  "epoch": 0.9568,
2099
- "grad_norm": 1.432335615158081,
2100
  "learning_rate": 7.169324051201438e-05,
2101
- "loss": 0.0353,
2102
  "step": 2990
2103
  },
2104
  {
2105
  "epoch": 0.96,
2106
- "grad_norm": 1.0536537170410156,
2107
  "learning_rate": 7.158095665843251e-05,
2108
- "loss": 0.039,
2109
  "step": 3000
2110
  },
2111
  {
2112
  "epoch": 0.9632,
2113
- "grad_norm": 0.7935389280319214,
2114
  "learning_rate": 7.146867280485066e-05,
2115
- "loss": 0.0334,
2116
  "step": 3010
2117
  },
2118
  {
2119
  "epoch": 0.9664,
2120
- "grad_norm": 1.4054580926895142,
2121
  "learning_rate": 7.135638895126882e-05,
2122
- "loss": 0.033,
2123
  "step": 3020
2124
  },
2125
  {
2126
  "epoch": 0.9696,
2127
- "grad_norm": 0.6271975040435791,
2128
  "learning_rate": 7.124410509768695e-05,
2129
- "loss": 0.0327,
2130
  "step": 3030
2131
  },
2132
  {
2133
  "epoch": 0.9728,
2134
- "grad_norm": 0.5951416492462158,
2135
  "learning_rate": 7.11318212441051e-05,
2136
- "loss": 0.0325,
2137
  "step": 3040
2138
  },
2139
  {
2140
  "epoch": 0.976,
2141
- "grad_norm": 0.6794223785400391,
2142
  "learning_rate": 7.101953739052325e-05,
2143
- "loss": 0.0336,
2144
  "step": 3050
2145
  },
2146
  {
2147
  "epoch": 0.9792,
2148
- "grad_norm": 1.084647536277771,
2149
  "learning_rate": 7.090725353694139e-05,
2150
- "loss": 0.035,
2151
  "step": 3060
2152
  },
2153
  {
2154
  "epoch": 0.9824,
2155
- "grad_norm": 0.40548598766326904,
2156
  "learning_rate": 7.079496968335954e-05,
2157
- "loss": 0.0277,
2158
  "step": 3070
2159
  },
2160
  {
2161
  "epoch": 0.9856,
2162
- "grad_norm": 0.6343255043029785,
2163
  "learning_rate": 7.068268582977768e-05,
2164
- "loss": 0.0282,
2165
  "step": 3080
2166
  },
2167
  {
2168
  "epoch": 0.9888,
2169
- "grad_norm": 0.53138667345047,
2170
  "learning_rate": 7.057040197619582e-05,
2171
- "loss": 0.032,
2172
  "step": 3090
2173
  },
2174
  {
2175
  "epoch": 0.992,
2176
- "grad_norm": 0.7178220748901367,
2177
  "learning_rate": 7.045811812261397e-05,
2178
- "loss": 0.0323,
2179
  "step": 3100
2180
  },
2181
  {
2182
  "epoch": 0.9952,
2183
- "grad_norm": 0.5384820103645325,
2184
  "learning_rate": 7.034583426903212e-05,
2185
- "loss": 0.0319,
2186
  "step": 3110
2187
  },
2188
  {
2189
  "epoch": 0.9984,
2190
- "grad_norm": 1.4491897821426392,
2191
  "learning_rate": 7.023355041545027e-05,
2192
- "loss": 0.0345,
2193
  "step": 3120
2194
  }
2195
  ],
 
12
  "epoch": 0.0032,
13
  "grad_norm": 10.0,
14
  "learning_rate": 2.132196162046908e-06,
15
+ "loss": 25.0099,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.0064,
20
+ "grad_norm": 10.0,
21
  "learning_rate": 4.264392324093816e-06,
22
+ "loss": 21.14,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.0096,
27
+ "grad_norm": 10.0,
28
  "learning_rate": 6.396588486140726e-06,
29
+ "loss": 19.5656,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.0128,
34
+ "grad_norm": 10.0,
35
  "learning_rate": 8.528784648187633e-06,
36
+ "loss": 22.2556,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.016,
41
+ "grad_norm": 9.999999046325684,
42
  "learning_rate": 1.0660980810234541e-05,
43
+ "loss": 19.8975,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.0192,
48
  "grad_norm": 10.0,
49
  "learning_rate": 1.2793176972281452e-05,
50
+ "loss": 19.9186,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.0224,
55
  "grad_norm": 10.0,
56
  "learning_rate": 1.4925373134328357e-05,
57
+ "loss": 19.9392,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.0256,
62
+ "grad_norm": 10.0,
63
  "learning_rate": 1.7057569296375266e-05,
64
+ "loss": 18.3684,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.0288,
69
+ "grad_norm": 9.999998092651367,
70
  "learning_rate": 1.9189765458422178e-05,
71
+ "loss": 16.6577,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.032,
76
  "grad_norm": 9.999999046325684,
77
  "learning_rate": 2.1321961620469083e-05,
78
+ "loss": 13.9967,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.0352,
83
+ "grad_norm": 10.0,
84
  "learning_rate": 2.345415778251599e-05,
85
+ "loss": 15.5673,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.0384,
90
+ "grad_norm": 9.999999046325684,
91
  "learning_rate": 2.5586353944562904e-05,
92
+ "loss": 12.1718,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.0416,
97
  "grad_norm": 10.0,
98
  "learning_rate": 2.771855010660981e-05,
99
+ "loss": 11.1047,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.0448,
104
+ "grad_norm": 10.0,
105
  "learning_rate": 2.9850746268656714e-05,
106
+ "loss": 10.4134,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.048,
111
+ "grad_norm": 10.0,
112
  "learning_rate": 3.1982942430703626e-05,
113
+ "loss": 10.6204,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.0512,
118
  "grad_norm": 9.999999046325684,
119
  "learning_rate": 3.411513859275053e-05,
120
+ "loss": 7.4005,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.0544,
125
+ "grad_norm": 9.999998092651367,
126
  "learning_rate": 3.624733475479744e-05,
127
+ "loss": 6.7938,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.0576,
132
+ "grad_norm": 9.999999046325684,
133
  "learning_rate": 3.8379530916844355e-05,
134
+ "loss": 5.8487,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.0608,
139
+ "grad_norm": 9.999999046325684,
140
  "learning_rate": 4.051172707889126e-05,
141
+ "loss": 5.6155,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.064,
146
+ "grad_norm": 9.999998092651367,
147
  "learning_rate": 4.2643923240938166e-05,
148
+ "loss": 4.978,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.0672,
153
+ "grad_norm": 9.999999046325684,
154
  "learning_rate": 4.477611940298508e-05,
155
+ "loss": 3.7961,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.0704,
160
+ "grad_norm": 9.999999046325684,
161
  "learning_rate": 4.690831556503198e-05,
162
+ "loss": 3.2914,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.0736,
167
+ "grad_norm": 8.79616641998291,
168
  "learning_rate": 4.904051172707889e-05,
169
+ "loss": 3.3423,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.0768,
174
+ "grad_norm": 8.134244918823242,
175
  "learning_rate": 5.117270788912581e-05,
176
+ "loss": 2.5389,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.08,
181
+ "grad_norm": 8.448352813720703,
182
  "learning_rate": 5.330490405117271e-05,
183
+ "loss": 2.2754,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.0832,
188
+ "grad_norm": 5.989101409912109,
189
  "learning_rate": 5.543710021321962e-05,
190
+ "loss": 2.0337,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.0864,
195
+ "grad_norm": 5.194303512573242,
196
  "learning_rate": 5.756929637526652e-05,
197
+ "loss": 1.8128,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.0896,
202
+ "grad_norm": 4.9008893966674805,
203
  "learning_rate": 5.970149253731343e-05,
204
+ "loss": 1.8,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.0928,
209
+ "grad_norm": 5.970029354095459,
210
  "learning_rate": 6.183368869936035e-05,
211
+ "loss": 1.6032,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.096,
216
+ "grad_norm": 5.243438720703125,
217
  "learning_rate": 6.396588486140725e-05,
218
+ "loss": 1.7633,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.0992,
223
+ "grad_norm": 4.298349857330322,
224
  "learning_rate": 6.609808102345416e-05,
225
+ "loss": 1.5381,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.1024,
230
+ "grad_norm": 4.325716495513916,
231
  "learning_rate": 6.823027718550106e-05,
232
+ "loss": 1.4,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.1056,
237
+ "grad_norm": 4.040591716766357,
238
  "learning_rate": 7.036247334754798e-05,
239
+ "loss": 1.485,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.1088,
244
+ "grad_norm": 4.4948015213012695,
245
  "learning_rate": 7.249466950959489e-05,
246
+ "loss": 1.3199,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.112,
251
+ "grad_norm": 4.134742736816406,
252
  "learning_rate": 7.46268656716418e-05,
253
+ "loss": 1.3535,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.1152,
258
+ "grad_norm": 3.762752056121826,
259
  "learning_rate": 7.675906183368871e-05,
260
+ "loss": 1.2331,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.1184,
265
+ "grad_norm": 4.193824291229248,
266
  "learning_rate": 7.889125799573562e-05,
267
+ "loss": 1.2674,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.1216,
272
+ "grad_norm": 3.6042652130126953,
273
  "learning_rate": 8.102345415778252e-05,
274
+ "loss": 1.2238,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.1248,
279
+ "grad_norm": 3.8218977451324463,
280
  "learning_rate": 8.315565031982943e-05,
281
+ "loss": 1.18,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.128,
286
+ "grad_norm": 4.003774166107178,
287
  "learning_rate": 8.528784648187633e-05,
288
+ "loss": 1.2161,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.1312,
293
+ "grad_norm": 3.954782247543335,
294
  "learning_rate": 8.742004264392325e-05,
295
+ "loss": 1.1235,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.1344,
300
+ "grad_norm": 4.31278657913208,
301
  "learning_rate": 8.955223880597016e-05,
302
+ "loss": 1.0786,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.1376,
307
+ "grad_norm": 3.7707276344299316,
308
  "learning_rate": 9.168443496801706e-05,
309
+ "loss": 1.0585,
310
  "step": 430
311
  },
312
  {
313
  "epoch": 0.1408,
314
+ "grad_norm": 5.498402118682861,
315
  "learning_rate": 9.381663113006397e-05,
316
+ "loss": 1.0594,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 0.144,
321
+ "grad_norm": 4.374643802642822,
322
  "learning_rate": 9.594882729211087e-05,
323
+ "loss": 1.0838,
324
  "step": 450
325
  },
326
  {
327
  "epoch": 0.1472,
328
+ "grad_norm": 4.22064733505249,
329
  "learning_rate": 9.808102345415778e-05,
330
+ "loss": 1.0391,
331
  "step": 460
332
  },
333
  {
334
  "epoch": 0.1504,
335
+ "grad_norm": 3.9745569229125977,
336
  "learning_rate": 9.998877161464182e-05,
337
+ "loss": 0.8823,
338
  "step": 470
339
  },
340
  {
341
  "epoch": 0.1536,
342
+ "grad_norm": 3.797243595123291,
343
  "learning_rate": 9.987648776105997e-05,
344
+ "loss": 0.889,
345
  "step": 480
346
  },
347
  {
348
  "epoch": 0.1568,
349
+ "grad_norm": 3.7148423194885254,
350
  "learning_rate": 9.97642039074781e-05,
351
+ "loss": 0.8886,
352
  "step": 490
353
  },
354
  {
355
  "epoch": 0.16,
356
+ "grad_norm": 3.3785812854766846,
357
  "learning_rate": 9.965192005389625e-05,
358
+ "loss": 0.9073,
359
  "step": 500
360
  },
361
  {
362
  "epoch": 0.1632,
363
+ "grad_norm": 3.5189311504364014,
364
  "learning_rate": 9.95396362003144e-05,
365
+ "loss": 0.8472,
366
  "step": 510
367
  },
368
  {
369
  "epoch": 0.1664,
370
+ "grad_norm": 4.278319835662842,
371
  "learning_rate": 9.942735234673256e-05,
372
+ "loss": 0.8266,
373
  "step": 520
374
  },
375
  {
376
  "epoch": 0.1696,
377
+ "grad_norm": 2.666747570037842,
378
  "learning_rate": 9.931506849315069e-05,
379
+ "loss": 0.7793,
380
  "step": 530
381
  },
382
  {
383
  "epoch": 0.1728,
384
+ "grad_norm": 6.504436016082764,
385
  "learning_rate": 9.920278463956883e-05,
386
+ "loss": 0.8237,
387
  "step": 540
388
  },
389
  {
390
  "epoch": 0.176,
391
+ "grad_norm": 3.135028839111328,
392
  "learning_rate": 9.909050078598698e-05,
393
+ "loss": 0.771,
394
  "step": 550
395
  },
396
  {
397
  "epoch": 0.1792,
398
+ "grad_norm": 3.2984988689422607,
399
  "learning_rate": 9.897821693240512e-05,
400
+ "loss": 0.7429,
401
  "step": 560
402
  },
403
  {
404
  "epoch": 0.1824,
405
+ "grad_norm": 4.183766841888428,
406
  "learning_rate": 9.886593307882327e-05,
407
+ "loss": 0.788,
408
  "step": 570
409
  },
410
  {
411
  "epoch": 0.1856,
412
+ "grad_norm": 3.8883821964263916,
413
  "learning_rate": 9.875364922524142e-05,
414
+ "loss": 0.8378,
415
  "step": 580
416
  },
417
  {
418
  "epoch": 0.1888,
419
+ "grad_norm": 3.9806365966796875,
420
  "learning_rate": 9.864136537165956e-05,
421
+ "loss": 0.7939,
422
  "step": 590
423
  },
424
  {
425
  "epoch": 0.192,
426
+ "grad_norm": 3.4956815242767334,
427
  "learning_rate": 9.852908151807771e-05,
428
+ "loss": 0.7603,
429
  "step": 600
430
  },
431
  {
432
  "epoch": 0.1952,
433
+ "grad_norm": 5.5648627281188965,
434
  "learning_rate": 9.841679766449586e-05,
435
+ "loss": 0.7235,
436
  "step": 610
437
  },
438
  {
439
  "epoch": 0.1984,
440
+ "grad_norm": 3.4344027042388916,
441
  "learning_rate": 9.8304513810914e-05,
442
+ "loss": 0.6992,
443
  "step": 620
444
  },
445
  {
446
  "epoch": 0.2016,
447
+ "grad_norm": 3.3964436054229736,
448
  "learning_rate": 9.819222995733213e-05,
449
+ "loss": 0.6704,
450
  "step": 630
451
  },
452
  {
453
  "epoch": 0.2048,
454
+ "grad_norm": 4.435855388641357,
455
  "learning_rate": 9.807994610375028e-05,
456
+ "loss": 0.6606,
457
  "step": 640
458
  },
459
  {
460
  "epoch": 0.208,
461
+ "grad_norm": 3.452620029449463,
462
  "learning_rate": 9.796766225016843e-05,
463
+ "loss": 0.6262,
464
  "step": 650
465
  },
466
  {
467
  "epoch": 0.2112,
468
+ "grad_norm": 3.708383798599243,
469
  "learning_rate": 9.785537839658657e-05,
470
+ "loss": 0.6535,
471
  "step": 660
472
  },
473
  {
474
  "epoch": 0.2144,
475
+ "grad_norm": 4.044270038604736,
476
  "learning_rate": 9.774309454300472e-05,
477
+ "loss": 0.5877,
478
  "step": 670
479
  },
480
  {
481
  "epoch": 0.2176,
482
+ "grad_norm": 3.3694796562194824,
483
  "learning_rate": 9.763081068942287e-05,
484
+ "loss": 0.5658,
485
  "step": 680
486
  },
487
  {
488
  "epoch": 0.2208,
489
+ "grad_norm": 3.8326478004455566,
490
  "learning_rate": 9.751852683584101e-05,
491
+ "loss": 0.5767,
492
  "step": 690
493
  },
494
  {
495
  "epoch": 0.224,
496
+ "grad_norm": 3.358949899673462,
497
  "learning_rate": 9.740624298225916e-05,
498
+ "loss": 0.5449,
499
  "step": 700
500
  },
501
  {
502
  "epoch": 0.2272,
503
+ "grad_norm": 4.013234615325928,
504
  "learning_rate": 9.729395912867731e-05,
505
+ "loss": 0.5099,
506
  "step": 710
507
  },
508
  {
509
  "epoch": 0.2304,
510
+ "grad_norm": 3.565805435180664,
511
  "learning_rate": 9.718167527509545e-05,
512
+ "loss": 0.5736,
513
  "step": 720
514
  },
515
  {
516
  "epoch": 0.2336,
517
+ "grad_norm": 3.975719690322876,
518
  "learning_rate": 9.706939142151358e-05,
519
+ "loss": 0.5294,
520
  "step": 730
521
  },
522
  {
523
  "epoch": 0.2368,
524
+ "grad_norm": 6.230079650878906,
525
  "learning_rate": 9.695710756793174e-05,
526
+ "loss": 0.5179,
527
  "step": 740
528
  },
529
  {
530
  "epoch": 0.24,
531
+ "grad_norm": 3.283184051513672,
532
  "learning_rate": 9.684482371434989e-05,
533
+ "loss": 0.496,
534
  "step": 750
535
  },
536
  {
537
  "epoch": 0.2432,
538
+ "grad_norm": 3.611090660095215,
539
  "learning_rate": 9.673253986076802e-05,
540
+ "loss": 0.5017,
541
  "step": 760
542
  },
543
  {
544
  "epoch": 0.2464,
545
+ "grad_norm": 6.290208339691162,
546
  "learning_rate": 9.662025600718617e-05,
547
+ "loss": 0.4952,
548
  "step": 770
549
  },
550
  {
551
  "epoch": 0.2496,
552
+ "grad_norm": 4.165639400482178,
553
  "learning_rate": 9.650797215360432e-05,
554
+ "loss": 0.4312,
555
  "step": 780
556
  },
557
  {
558
  "epoch": 0.2528,
559
+ "grad_norm": 3.7825958728790283,
560
  "learning_rate": 9.639568830002246e-05,
561
+ "loss": 0.412,
562
  "step": 790
563
  },
564
  {
565
  "epoch": 0.256,
566
+ "grad_norm": 6.383756637573242,
567
  "learning_rate": 9.628340444644061e-05,
568
+ "loss": 0.4097,
569
  "step": 800
570
  },
571
  {
572
  "epoch": 0.2592,
573
+ "grad_norm": 2.7888543605804443,
574
  "learning_rate": 9.617112059285875e-05,
575
+ "loss": 0.3707,
576
  "step": 810
577
  },
578
  {
579
  "epoch": 0.2624,
580
+ "grad_norm": 1.9508731365203857,
581
  "learning_rate": 9.605883673927689e-05,
582
+ "loss": 0.3584,
583
  "step": 820
584
  },
585
  {
586
  "epoch": 0.2656,
587
+ "grad_norm": 3.938863515853882,
588
  "learning_rate": 9.594655288569504e-05,
589
+ "loss": 0.3721,
590
  "step": 830
591
  },
592
  {
593
  "epoch": 0.2688,
594
+ "grad_norm": 2.89081072807312,
595
  "learning_rate": 9.583426903211319e-05,
596
+ "loss": 0.3412,
597
  "step": 840
598
  },
599
  {
600
  "epoch": 0.272,
601
+ "grad_norm": 2.9330146312713623,
602
  "learning_rate": 9.572198517853134e-05,
603
+ "loss": 0.2764,
604
  "step": 850
605
  },
606
  {
607
  "epoch": 0.2752,
608
+ "grad_norm": 2.5014054775238037,
609
  "learning_rate": 9.560970132494948e-05,
610
+ "loss": 0.2725,
611
  "step": 860
612
  },
613
  {
614
  "epoch": 0.2784,
615
+ "grad_norm": 2.38750958442688,
616
  "learning_rate": 9.549741747136763e-05,
617
+ "loss": 0.2933,
618
  "step": 870
619
  },
620
  {
621
  "epoch": 0.2816,
622
+ "grad_norm": 3.248847246170044,
623
  "learning_rate": 9.538513361778578e-05,
624
+ "loss": 0.309,
625
  "step": 880
626
  },
627
  {
628
  "epoch": 0.2848,
629
+ "grad_norm": 4.140264987945557,
630
  "learning_rate": 9.527284976420391e-05,
631
+ "loss": 0.2656,
632
  "step": 890
633
  },
634
  {
635
  "epoch": 0.288,
636
+ "grad_norm": 2.2702248096466064,
637
  "learning_rate": 9.516056591062205e-05,
638
+ "loss": 0.2928,
639
  "step": 900
640
  },
641
  {
642
  "epoch": 0.2912,
643
+ "grad_norm": 5.701258659362793,
644
  "learning_rate": 9.50482820570402e-05,
645
+ "loss": 0.267,
646
  "step": 910
647
  },
648
  {
649
  "epoch": 0.2944,
650
+ "grad_norm": 2.0174479484558105,
651
  "learning_rate": 9.493599820345834e-05,
652
+ "loss": 0.2388,
653
  "step": 920
654
  },
655
  {
656
  "epoch": 0.2976,
657
+ "grad_norm": 2.4099040031433105,
658
  "learning_rate": 9.482371434987649e-05,
659
+ "loss": 0.2303,
660
  "step": 930
661
  },
662
  {
663
  "epoch": 0.3008,
664
+ "grad_norm": 2.7431418895721436,
665
  "learning_rate": 9.471143049629464e-05,
666
+ "loss": 0.2389,
667
  "step": 940
668
  },
669
  {
670
  "epoch": 0.304,
671
+ "grad_norm": 1.7483993768692017,
672
  "learning_rate": 9.459914664271278e-05,
673
+ "loss": 0.2555,
674
  "step": 950
675
  },
676
  {
677
  "epoch": 0.3072,
678
+ "grad_norm": 2.9412496089935303,
679
  "learning_rate": 9.448686278913093e-05,
680
+ "loss": 0.2189,
681
  "step": 960
682
  },
683
  {
684
  "epoch": 0.3104,
685
+ "grad_norm": 2.591343402862549,
686
  "learning_rate": 9.437457893554908e-05,
687
+ "loss": 0.1972,
688
  "step": 970
689
  },
690
  {
691
  "epoch": 0.3136,
692
+ "grad_norm": 2.46079421043396,
693
  "learning_rate": 9.426229508196722e-05,
694
+ "loss": 0.1959,
695
  "step": 980
696
  },
697
  {
698
  "epoch": 0.3168,
699
+ "grad_norm": 3.11035418510437,
700
  "learning_rate": 9.415001122838537e-05,
701
+ "loss": 0.1952,
702
  "step": 990
703
  },
704
  {
705
  "epoch": 0.32,
706
+ "grad_norm": 1.9760693311691284,
707
  "learning_rate": 9.40377273748035e-05,
708
+ "loss": 0.1732,
709
  "step": 1000
710
  },
711
  {
712
  "epoch": 0.3232,
713
+ "grad_norm": 1.6762940883636475,
714
  "learning_rate": 9.392544352122165e-05,
715
+ "loss": 0.1591,
716
  "step": 1010
717
  },
718
  {
719
  "epoch": 0.3264,
720
+ "grad_norm": 2.380981922149658,
721
  "learning_rate": 9.381315966763979e-05,
722
+ "loss": 0.1799,
723
  "step": 1020
724
  },
725
  {
726
  "epoch": 0.3296,
727
+ "grad_norm": 3.2577524185180664,
728
  "learning_rate": 9.370087581405794e-05,
729
+ "loss": 0.1994,
730
  "step": 1030
731
  },
732
  {
733
  "epoch": 0.3328,
734
+ "grad_norm": 1.5915032625198364,
735
  "learning_rate": 9.358859196047609e-05,
736
+ "loss": 0.1466,
737
  "step": 1040
738
  },
739
  {
740
  "epoch": 0.336,
741
+ "grad_norm": 1.7364068031311035,
742
  "learning_rate": 9.347630810689423e-05,
743
+ "loss": 0.164,
744
  "step": 1050
745
  },
746
  {
747
  "epoch": 0.3392,
748
+ "grad_norm": 2.3017282485961914,
749
  "learning_rate": 9.336402425331238e-05,
750
+ "loss": 0.1564,
751
  "step": 1060
752
  },
753
  {
754
  "epoch": 0.3424,
755
+ "grad_norm": 1.5479563474655151,
756
  "learning_rate": 9.325174039973053e-05,
757
+ "loss": 0.1413,
758
  "step": 1070
759
  },
760
  {
761
  "epoch": 0.3456,
762
+ "grad_norm": 2.007779836654663,
763
  "learning_rate": 9.313945654614867e-05,
764
+ "loss": 0.143,
765
  "step": 1080
766
  },
767
  {
768
  "epoch": 0.3488,
769
+ "grad_norm": 1.2506276369094849,
770
  "learning_rate": 9.30271726925668e-05,
771
+ "loss": 0.1297,
772
  "step": 1090
773
  },
774
  {
775
  "epoch": 0.352,
776
+ "grad_norm": 1.4856570959091187,
777
  "learning_rate": 9.291488883898496e-05,
778
+ "loss": 0.1355,
779
  "step": 1100
780
  },
781
  {
782
  "epoch": 0.3552,
783
+ "grad_norm": 2.262418270111084,
784
  "learning_rate": 9.280260498540311e-05,
785
+ "loss": 0.1374,
786
  "step": 1110
787
  },
788
  {
789
  "epoch": 0.3584,
790
+ "grad_norm": 1.535651683807373,
791
  "learning_rate": 9.269032113182124e-05,
792
+ "loss": 0.1331,
793
  "step": 1120
794
  },
795
  {
796
  "epoch": 0.3616,
797
+ "grad_norm": 1.3784823417663574,
798
  "learning_rate": 9.25780372782394e-05,
799
+ "loss": 0.1495,
800
  "step": 1130
801
  },
802
  {
803
  "epoch": 0.3648,
804
+ "grad_norm": 1.548283338546753,
805
  "learning_rate": 9.246575342465755e-05,
806
+ "loss": 0.1194,
807
  "step": 1140
808
  },
809
  {
810
  "epoch": 0.368,
811
+ "grad_norm": 2.1643035411834717,
812
  "learning_rate": 9.235346957107568e-05,
813
+ "loss": 0.1285,
814
  "step": 1150
815
  },
816
  {
817
  "epoch": 0.3712,
818
+ "grad_norm": 1.8928327560424805,
819
  "learning_rate": 9.224118571749383e-05,
820
+ "loss": 0.1189,
821
  "step": 1160
822
  },
823
  {
824
  "epoch": 0.3744,
825
+ "grad_norm": 1.3566118478775024,
826
  "learning_rate": 9.212890186391197e-05,
827
+ "loss": 0.1165,
828
  "step": 1170
829
  },
830
  {
831
  "epoch": 0.3776,
832
+ "grad_norm": 1.5916873216629028,
833
  "learning_rate": 9.201661801033011e-05,
834
+ "loss": 0.1129,
835
  "step": 1180
836
  },
837
  {
838
  "epoch": 0.3808,
839
+ "grad_norm": 2.1428229808807373,
840
  "learning_rate": 9.190433415674826e-05,
841
+ "loss": 0.1176,
842
  "step": 1190
843
  },
844
  {
845
  "epoch": 0.384,
846
+ "grad_norm": 1.2154109477996826,
847
  "learning_rate": 9.179205030316641e-05,
848
+ "loss": 0.1045,
849
  "step": 1200
850
  },
851
  {
852
  "epoch": 0.3872,
853
+ "grad_norm": 1.4908652305603027,
854
  "learning_rate": 9.167976644958456e-05,
855
+ "loss": 0.1061,
856
  "step": 1210
857
  },
858
  {
859
  "epoch": 0.3904,
860
+ "grad_norm": 1.2576334476470947,
861
  "learning_rate": 9.15674825960027e-05,
862
+ "loss": 0.0973,
863
  "step": 1220
864
  },
865
  {
866
  "epoch": 0.3936,
867
+ "grad_norm": 2.442065954208374,
868
  "learning_rate": 9.145519874242085e-05,
869
+ "loss": 0.1057,
870
  "step": 1230
871
  },
872
  {
873
  "epoch": 0.3968,
874
+ "grad_norm": 1.8123060464859009,
875
  "learning_rate": 9.1342914888839e-05,
876
+ "loss": 0.1037,
877
  "step": 1240
878
  },
879
  {
880
  "epoch": 0.4,
881
+ "grad_norm": 3.8450021743774414,
882
  "learning_rate": 9.123063103525713e-05,
883
+ "loss": 0.1042,
884
  "step": 1250
885
  },
886
  {
887
  "epoch": 0.4032,
888
+ "grad_norm": 1.4633779525756836,
889
  "learning_rate": 9.111834718167527e-05,
890
+ "loss": 0.1081,
891
  "step": 1260
892
  },
893
  {
894
  "epoch": 0.4064,
895
+ "grad_norm": 2.0343596935272217,
896
  "learning_rate": 9.100606332809342e-05,
897
+ "loss": 0.0845,
898
  "step": 1270
899
  },
900
  {
901
  "epoch": 0.4096,
902
+ "grad_norm": 2.1787729263305664,
903
  "learning_rate": 9.089377947451156e-05,
904
+ "loss": 0.1067,
905
  "step": 1280
906
  },
907
  {
908
  "epoch": 0.4128,
909
+ "grad_norm": 1.5552949905395508,
910
  "learning_rate": 9.078149562092971e-05,
911
+ "loss": 0.107,
912
  "step": 1290
913
  },
914
  {
915
  "epoch": 0.416,
916
+ "grad_norm": 2.656562566757202,
917
  "learning_rate": 9.066921176734786e-05,
918
+ "loss": 0.0911,
919
  "step": 1300
920
  },
921
  {
922
  "epoch": 0.4192,
923
+ "grad_norm": 1.4937834739685059,
924
  "learning_rate": 9.055692791376601e-05,
925
+ "loss": 0.0992,
926
  "step": 1310
927
  },
928
  {
929
  "epoch": 0.4224,
930
+ "grad_norm": 1.6240930557250977,
931
  "learning_rate": 9.044464406018415e-05,
932
+ "loss": 0.0914,
933
  "step": 1320
934
  },
935
  {
936
  "epoch": 0.4256,
937
+ "grad_norm": 1.9437700510025024,
938
  "learning_rate": 9.03323602066023e-05,
939
+ "loss": 0.0985,
940
  "step": 1330
941
  },
942
  {
943
  "epoch": 0.4288,
944
+ "grad_norm": 1.0026746988296509,
945
  "learning_rate": 9.022007635302045e-05,
946
+ "loss": 0.0917,
947
  "step": 1340
948
  },
949
  {
950
  "epoch": 0.432,
951
+ "grad_norm": 0.9967594742774963,
952
  "learning_rate": 9.010779249943859e-05,
953
+ "loss": 0.0793,
954
  "step": 1350
955
  },
956
  {
957
  "epoch": 0.4352,
958
+ "grad_norm": 0.9784688353538513,
959
  "learning_rate": 8.999550864585672e-05,
960
+ "loss": 0.0826,
961
  "step": 1360
962
  },
963
  {
964
  "epoch": 0.4384,
965
+ "grad_norm": 1.4356553554534912,
966
  "learning_rate": 8.988322479227488e-05,
967
+ "loss": 0.0936,
968
  "step": 1370
969
  },
970
  {
971
  "epoch": 0.4416,
972
+ "grad_norm": 1.7505671977996826,
973
  "learning_rate": 8.977094093869301e-05,
974
+ "loss": 0.0775,
975
  "step": 1380
976
  },
977
  {
978
  "epoch": 0.4448,
979
+ "grad_norm": 1.1895841360092163,
980
  "learning_rate": 8.965865708511116e-05,
981
+ "loss": 0.087,
982
  "step": 1390
983
  },
984
  {
985
  "epoch": 0.448,
986
+ "grad_norm": 1.9759968519210815,
987
  "learning_rate": 8.954637323152931e-05,
988
+ "loss": 0.0777,
989
  "step": 1400
990
  },
991
  {
992
  "epoch": 0.4512,
993
+ "grad_norm": 1.276041865348816,
994
  "learning_rate": 8.943408937794746e-05,
995
+ "loss": 0.0937,
996
  "step": 1410
997
  },
998
  {
999
  "epoch": 0.4544,
1000
+ "grad_norm": 1.0501399040222168,
1001
  "learning_rate": 8.93218055243656e-05,
1002
+ "loss": 0.0821,
1003
  "step": 1420
1004
  },
1005
  {
1006
  "epoch": 0.4576,
1007
+ "grad_norm": 0.9872506856918335,
1008
  "learning_rate": 8.920952167078375e-05,
1009
+ "loss": 0.076,
1010
  "step": 1430
1011
  },
1012
  {
1013
  "epoch": 0.4608,
1014
+ "grad_norm": 1.2520620822906494,
1015
  "learning_rate": 8.909723781720189e-05,
1016
+ "loss": 0.0719,
1017
  "step": 1440
1018
  },
1019
  {
1020
  "epoch": 0.464,
1021
+ "grad_norm": 1.0118720531463623,
1022
  "learning_rate": 8.898495396362003e-05,
1023
+ "loss": 0.0712,
1024
  "step": 1450
1025
  },
1026
  {
1027
  "epoch": 0.4672,
1028
+ "grad_norm": 1.3171415328979492,
1029
  "learning_rate": 8.887267011003818e-05,
1030
+ "loss": 0.0701,
1031
  "step": 1460
1032
  },
1033
  {
1034
  "epoch": 0.4704,
1035
+ "grad_norm": 1.0808926820755005,
1036
  "learning_rate": 8.876038625645633e-05,
1037
+ "loss": 0.0754,
1038
  "step": 1470
1039
  },
1040
  {
1041
  "epoch": 0.4736,
1042
+ "grad_norm": 2.9019877910614014,
1043
  "learning_rate": 8.864810240287447e-05,
1044
+ "loss": 0.0767,
1045
  "step": 1480
1046
  },
1047
  {
1048
  "epoch": 0.4768,
1049
+ "grad_norm": 1.3460652828216553,
1050
  "learning_rate": 8.853581854929262e-05,
1051
+ "loss": 0.0782,
1052
  "step": 1490
1053
  },
1054
  {
1055
  "epoch": 0.48,
1056
+ "grad_norm": 1.6336652040481567,
1057
  "learning_rate": 8.842353469571077e-05,
1058
+ "loss": 0.0723,
1059
  "step": 1500
1060
  },
1061
  {
1062
  "epoch": 0.4832,
1063
+ "grad_norm": 0.9963213205337524,
1064
  "learning_rate": 8.83112508421289e-05,
1065
+ "loss": 0.068,
1066
  "step": 1510
1067
  },
1068
  {
1069
  "epoch": 0.4864,
1070
+ "grad_norm": 1.618598222732544,
1071
  "learning_rate": 8.819896698854705e-05,
1072
+ "loss": 0.0773,
1073
  "step": 1520
1074
  },
1075
  {
1076
  "epoch": 0.4896,
1077
+ "grad_norm": 0.9730167984962463,
1078
  "learning_rate": 8.808668313496519e-05,
1079
+ "loss": 0.0633,
1080
  "step": 1530
1081
  },
1082
  {
1083
  "epoch": 0.4928,
1084
+ "grad_norm": 1.4831032752990723,
1085
  "learning_rate": 8.797439928138334e-05,
1086
+ "loss": 0.0641,
1087
  "step": 1540
1088
  },
1089
  {
1090
  "epoch": 0.496,
1091
+ "grad_norm": 0.9782041311264038,
1092
  "learning_rate": 8.786211542780148e-05,
1093
+ "loss": 0.0758,
1094
  "step": 1550
1095
  },
1096
  {
1097
  "epoch": 0.4992,
1098
+ "grad_norm": 1.8618321418762207,
1099
  "learning_rate": 8.774983157421963e-05,
1100
+ "loss": 0.0657,
1101
  "step": 1560
1102
  },
1103
  {
1104
  "epoch": 0.5024,
1105
+ "grad_norm": 1.637278437614441,
1106
  "learning_rate": 8.763754772063778e-05,
1107
+ "loss": 0.0615,
1108
  "step": 1570
1109
  },
1110
  {
1111
  "epoch": 0.5056,
1112
+ "grad_norm": 0.9550865292549133,
1113
  "learning_rate": 8.752526386705592e-05,
1114
+ "loss": 0.0614,
1115
  "step": 1580
1116
  },
1117
  {
1118
  "epoch": 0.5088,
1119
+ "grad_norm": 2.0610525608062744,
1120
  "learning_rate": 8.741298001347407e-05,
1121
+ "loss": 0.0657,
1122
  "step": 1590
1123
  },
1124
  {
1125
  "epoch": 0.512,
1126
+ "grad_norm": 0.7099101543426514,
1127
  "learning_rate": 8.730069615989222e-05,
1128
+ "loss": 0.0631,
1129
  "step": 1600
1130
  },
1131
  {
1132
  "epoch": 0.5152,
1133
+ "grad_norm": 1.1444408893585205,
1134
  "learning_rate": 8.718841230631036e-05,
1135
+ "loss": 0.0601,
1136
  "step": 1610
1137
  },
1138
  {
1139
  "epoch": 0.5184,
1140
+ "grad_norm": 2.0309245586395264,
1141
  "learning_rate": 8.70761284527285e-05,
1142
+ "loss": 0.0564,
1143
  "step": 1620
1144
  },
1145
  {
1146
  "epoch": 0.5216,
1147
+ "grad_norm": 1.3110053539276123,
1148
  "learning_rate": 8.696384459914664e-05,
1149
+ "loss": 0.0637,
1150
  "step": 1630
1151
  },
1152
  {
1153
  "epoch": 0.5248,
1154
+ "grad_norm": 1.2481273412704468,
1155
  "learning_rate": 8.68515607455648e-05,
1156
+ "loss": 0.0562,
1157
  "step": 1640
1158
  },
1159
  {
1160
  "epoch": 0.528,
1161
+ "grad_norm": 1.6358749866485596,
1162
  "learning_rate": 8.673927689198293e-05,
1163
+ "loss": 0.0612,
1164
  "step": 1650
1165
  },
1166
  {
1167
  "epoch": 0.5312,
1168
+ "grad_norm": 1.46366548538208,
1169
  "learning_rate": 8.662699303840108e-05,
1170
+ "loss": 0.0603,
1171
  "step": 1660
1172
  },
1173
  {
1174
  "epoch": 0.5344,
1175
+ "grad_norm": 0.9687389731407166,
1176
  "learning_rate": 8.651470918481923e-05,
1177
+ "loss": 0.055,
1178
  "step": 1670
1179
  },
1180
  {
1181
  "epoch": 0.5376,
1182
+ "grad_norm": 0.8391016125679016,
1183
  "learning_rate": 8.640242533123737e-05,
1184
+ "loss": 0.0545,
1185
  "step": 1680
1186
  },
1187
  {
1188
  "epoch": 0.5408,
1189
+ "grad_norm": 1.6513214111328125,
1190
  "learning_rate": 8.629014147765552e-05,
1191
+ "loss": 0.0581,
1192
  "step": 1690
1193
  },
1194
  {
1195
  "epoch": 0.544,
1196
+ "grad_norm": 0.5965524315834045,
1197
  "learning_rate": 8.617785762407367e-05,
1198
+ "loss": 0.0556,
1199
  "step": 1700
1200
  },
1201
  {
1202
  "epoch": 0.5472,
1203
+ "grad_norm": 0.8696075677871704,
1204
  "learning_rate": 8.606557377049181e-05,
1205
+ "loss": 0.0608,
1206
  "step": 1710
1207
  },
1208
  {
1209
  "epoch": 0.5504,
1210
+ "grad_norm": 1.1388493776321411,
1211
  "learning_rate": 8.595328991690995e-05,
1212
+ "loss": 0.0526,
1213
  "step": 1720
1214
  },
1215
  {
1216
  "epoch": 0.5536,
1217
+ "grad_norm": 0.9238974452018738,
1218
  "learning_rate": 8.58410060633281e-05,
1219
+ "loss": 0.0545,
1220
  "step": 1730
1221
  },
1222
  {
1223
  "epoch": 0.5568,
1224
+ "grad_norm": 1.38905930519104,
1225
  "learning_rate": 8.572872220974623e-05,
1226
+ "loss": 0.0585,
1227
  "step": 1740
1228
  },
1229
  {
1230
  "epoch": 0.56,
1231
+ "grad_norm": 1.6556098461151123,
1232
  "learning_rate": 8.561643835616438e-05,
1233
+ "loss": 0.054,
1234
  "step": 1750
1235
  },
1236
  {
1237
  "epoch": 0.5632,
1238
+ "grad_norm": 0.9753937721252441,
1239
  "learning_rate": 8.550415450258253e-05,
1240
+ "loss": 0.0489,
1241
  "step": 1760
1242
  },
1243
  {
1244
  "epoch": 0.5664,
1245
+ "grad_norm": 1.2688230276107788,
1246
  "learning_rate": 8.539187064900069e-05,
1247
+ "loss": 0.0496,
1248
  "step": 1770
1249
  },
1250
  {
1251
  "epoch": 0.5696,
1252
+ "grad_norm": 1.6871953010559082,
1253
  "learning_rate": 8.527958679541882e-05,
1254
+ "loss": 0.052,
1255
  "step": 1780
1256
  },
1257
  {
1258
  "epoch": 0.5728,
1259
+ "grad_norm": 1.1844048500061035,
1260
  "learning_rate": 8.516730294183697e-05,
1261
+ "loss": 0.0544,
1262
  "step": 1790
1263
  },
1264
  {
1265
  "epoch": 0.576,
1266
+ "grad_norm": 0.7625532746315002,
1267
  "learning_rate": 8.505501908825511e-05,
1268
+ "loss": 0.0501,
1269
  "step": 1800
1270
  },
1271
  {
1272
  "epoch": 0.5792,
1273
+ "grad_norm": 0.9352998733520508,
1274
  "learning_rate": 8.494273523467325e-05,
1275
+ "loss": 0.0517,
1276
  "step": 1810
1277
  },
1278
  {
1279
  "epoch": 0.5824,
1280
+ "grad_norm": 2.186729669570923,
1281
  "learning_rate": 8.48304513810914e-05,
1282
+ "loss": 0.0565,
1283
  "step": 1820
1284
  },
1285
  {
1286
  "epoch": 0.5856,
1287
+ "grad_norm": 0.6474363803863525,
1288
  "learning_rate": 8.471816752750955e-05,
1289
+ "loss": 0.0512,
1290
  "step": 1830
1291
  },
1292
  {
1293
  "epoch": 0.5888,
1294
+ "grad_norm": 1.0736746788024902,
1295
  "learning_rate": 8.460588367392769e-05,
1296
+ "loss": 0.0484,
1297
  "step": 1840
1298
  },
1299
  {
1300
  "epoch": 0.592,
1301
+ "grad_norm": 1.6603658199310303,
1302
  "learning_rate": 8.449359982034584e-05,
1303
+ "loss": 0.055,
1304
  "step": 1850
1305
  },
1306
  {
1307
  "epoch": 0.5952,
1308
+ "grad_norm": 0.8258435726165771,
1309
  "learning_rate": 8.438131596676399e-05,
1310
+ "loss": 0.0521,
1311
  "step": 1860
1312
  },
1313
  {
1314
  "epoch": 0.5984,
1315
+ "grad_norm": 0.8024755716323853,
1316
  "learning_rate": 8.426903211318214e-05,
1317
+ "loss": 0.0456,
1318
  "step": 1870
1319
  },
1320
  {
1321
  "epoch": 0.6016,
1322
+ "grad_norm": 1.215813159942627,
1323
  "learning_rate": 8.415674825960028e-05,
1324
+ "loss": 0.0465,
1325
  "step": 1880
1326
  },
1327
  {
1328
  "epoch": 0.6048,
1329
+ "grad_norm": 0.6433802247047424,
1330
  "learning_rate": 8.404446440601843e-05,
1331
+ "loss": 0.0511,
1332
  "step": 1890
1333
  },
1334
  {
1335
  "epoch": 0.608,
1336
+ "grad_norm": 0.5610775947570801,
1337
  "learning_rate": 8.393218055243656e-05,
1338
+ "loss": 0.048,
1339
  "step": 1900
1340
  },
1341
  {
1342
  "epoch": 0.6112,
1343
+ "grad_norm": 1.1887115240097046,
1344
  "learning_rate": 8.38198966988547e-05,
1345
+ "loss": 0.047,
1346
  "step": 1910
1347
  },
1348
  {
1349
  "epoch": 0.6144,
1350
+ "grad_norm": 0.7299818396568298,
1351
  "learning_rate": 8.370761284527285e-05,
1352
+ "loss": 0.0498,
1353
  "step": 1920
1354
  },
1355
  {
1356
  "epoch": 0.6176,
1357
+ "grad_norm": 1.1920089721679688,
1358
  "learning_rate": 8.3595328991691e-05,
1359
+ "loss": 0.0458,
1360
  "step": 1930
1361
  },
1362
  {
1363
  "epoch": 0.6208,
1364
+ "grad_norm": 0.6778120994567871,
1365
  "learning_rate": 8.348304513810914e-05,
1366
+ "loss": 0.0455,
1367
  "step": 1940
1368
  },
1369
  {
1370
  "epoch": 0.624,
1371
+ "grad_norm": 1.1930702924728394,
1372
  "learning_rate": 8.337076128452729e-05,
1373
+ "loss": 0.1967,
1374
  "step": 1950
1375
  },
1376
  {
1377
  "epoch": 0.6272,
1378
+ "grad_norm": 1.2795714139938354,
1379
  "learning_rate": 8.325847743094544e-05,
1380
+ "loss": 0.9457,
1381
  "step": 1960
1382
  },
1383
  {
1384
  "epoch": 0.6304,
1385
+ "grad_norm": 1.2569034099578857,
1386
  "learning_rate": 8.314619357736358e-05,
1387
+ "loss": 0.0661,
1388
  "step": 1970
1389
  },
1390
  {
1391
  "epoch": 0.6336,
1392
+ "grad_norm": 1.6149742603302002,
1393
  "learning_rate": 8.303390972378173e-05,
1394
+ "loss": 0.0981,
1395
  "step": 1980
1396
  },
1397
  {
1398
  "epoch": 0.6368,
1399
+ "grad_norm": 10.0,
1400
  "learning_rate": 8.292162587019986e-05,
1401
+ "loss": 0.0525,
1402
  "step": 1990
1403
  },
1404
  {
1405
  "epoch": 0.64,
1406
+ "grad_norm": 2.119809150695801,
1407
  "learning_rate": 8.280934201661802e-05,
1408
+ "loss": 0.0533,
1409
  "step": 2000
1410
  },
1411
  {
1412
  "epoch": 0.6432,
1413
+ "grad_norm": 1.103276252746582,
1414
  "learning_rate": 8.269705816303615e-05,
1415
+ "loss": 0.0471,
1416
  "step": 2010
1417
  },
1418
  {
1419
  "epoch": 0.6464,
1420
+ "grad_norm": 0.8100212216377258,
1421
  "learning_rate": 8.25847743094543e-05,
1422
+ "loss": 0.0463,
1423
  "step": 2020
1424
  },
1425
  {
1426
  "epoch": 0.6496,
1427
+ "grad_norm": 0.9989579319953918,
1428
  "learning_rate": 8.247249045587245e-05,
1429
+ "loss": 0.044,
1430
  "step": 2030
1431
  },
1432
  {
1433
  "epoch": 0.6528,
1434
+ "grad_norm": 0.8777104616165161,
1435
  "learning_rate": 8.236020660229059e-05,
1436
+ "loss": 0.0477,
1437
  "step": 2040
1438
  },
1439
  {
1440
  "epoch": 0.656,
1441
+ "grad_norm": 0.8403288125991821,
1442
  "learning_rate": 8.224792274870874e-05,
1443
+ "loss": 0.0427,
1444
  "step": 2050
1445
  },
1446
  {
1447
  "epoch": 0.6592,
1448
+ "grad_norm": 0.920377254486084,
1449
  "learning_rate": 8.213563889512689e-05,
1450
+ "loss": 0.0414,
1451
  "step": 2060
1452
  },
1453
  {
1454
  "epoch": 0.6624,
1455
+ "grad_norm": 0.8990156650543213,
1456
  "learning_rate": 8.202335504154503e-05,
1457
+ "loss": 0.0441,
1458
  "step": 2070
1459
  },
1460
  {
1461
  "epoch": 0.6656,
1462
+ "grad_norm": 0.6968216896057129,
1463
  "learning_rate": 8.191107118796317e-05,
1464
+ "loss": 0.0408,
1465
  "step": 2080
1466
  },
1467
  {
1468
  "epoch": 0.6688,
1469
+ "grad_norm": 0.8257766366004944,
1470
  "learning_rate": 8.179878733438132e-05,
1471
+ "loss": 0.0435,
1472
  "step": 2090
1473
  },
1474
  {
1475
  "epoch": 0.672,
1476
+ "grad_norm": 1.689205288887024,
1477
  "learning_rate": 8.168650348079947e-05,
1478
+ "loss": 0.0402,
1479
  "step": 2100
1480
  },
1481
  {
1482
  "epoch": 0.6752,
1483
+ "grad_norm": 0.766890287399292,
1484
  "learning_rate": 8.15742196272176e-05,
1485
+ "loss": 0.0393,
1486
  "step": 2110
1487
  },
1488
  {
1489
  "epoch": 0.6784,
1490
+ "grad_norm": 0.8452049493789673,
1491
  "learning_rate": 8.146193577363576e-05,
1492
+ "loss": 0.037,
1493
  "step": 2120
1494
  },
1495
  {
1496
  "epoch": 0.6816,
1497
+ "grad_norm": 0.6207525134086609,
1498
  "learning_rate": 8.13496519200539e-05,
1499
+ "loss": 0.0406,
1500
  "step": 2130
1501
  },
1502
  {
1503
  "epoch": 0.6848,
1504
+ "grad_norm": 1.4125391244888306,
1505
  "learning_rate": 8.123736806647204e-05,
1506
+ "loss": 0.041,
1507
  "step": 2140
1508
  },
1509
  {
1510
  "epoch": 0.688,
1511
+ "grad_norm": 1.1123186349868774,
1512
  "learning_rate": 8.11250842128902e-05,
1513
+ "loss": 0.0331,
1514
  "step": 2150
1515
  },
1516
  {
1517
  "epoch": 0.6912,
1518
+ "grad_norm": 0.769235372543335,
1519
  "learning_rate": 8.101280035930835e-05,
1520
+ "loss": 0.0432,
1521
  "step": 2160
1522
  },
1523
  {
1524
  "epoch": 0.6944,
1525
+ "grad_norm": 0.7295809984207153,
1526
  "learning_rate": 8.090051650572648e-05,
1527
+ "loss": 0.0395,
1528
  "step": 2170
1529
  },
1530
  {
1531
  "epoch": 0.6976,
1532
+ "grad_norm": 1.110095739364624,
1533
  "learning_rate": 8.078823265214462e-05,
1534
+ "loss": 0.036,
1535
  "step": 2180
1536
  },
1537
  {
1538
  "epoch": 0.7008,
1539
+ "grad_norm": 0.5310615301132202,
1540
  "learning_rate": 8.067594879856277e-05,
1541
+ "loss": 0.0384,
1542
  "step": 2190
1543
  },
1544
  {
1545
  "epoch": 0.704,
1546
+ "grad_norm": 0.8980706930160522,
1547
  "learning_rate": 8.056366494498092e-05,
1548
+ "loss": 0.0331,
1549
  "step": 2200
1550
  },
1551
  {
1552
  "epoch": 0.7072,
1553
+ "grad_norm": 0.9225451946258545,
1554
  "learning_rate": 8.045138109139906e-05,
1555
+ "loss": 0.0385,
1556
  "step": 2210
1557
  },
1558
  {
1559
  "epoch": 0.7104,
1560
+ "grad_norm": 0.6329707503318787,
1561
  "learning_rate": 8.033909723781721e-05,
1562
+ "loss": 0.038,
1563
  "step": 2220
1564
  },
1565
  {
1566
  "epoch": 0.7136,
1567
+ "grad_norm": 0.588439404964447,
1568
  "learning_rate": 8.022681338423536e-05,
1569
+ "loss": 0.0354,
1570
  "step": 2230
1571
  },
1572
  {
1573
  "epoch": 0.7168,
1574
+ "grad_norm": 0.7347375750541687,
1575
  "learning_rate": 8.01145295306535e-05,
1576
+ "loss": 0.0379,
1577
  "step": 2240
1578
  },
1579
  {
1580
  "epoch": 0.72,
1581
+ "grad_norm": 1.577707290649414,
1582
  "learning_rate": 8.000224567707165e-05,
1583
+ "loss": 0.0391,
1584
  "step": 2250
1585
  },
1586
  {
1587
  "epoch": 0.7232,
1588
+ "grad_norm": 1.1854794025421143,
1589
  "learning_rate": 7.988996182348978e-05,
1590
+ "loss": 0.0354,
1591
  "step": 2260
1592
  },
1593
  {
1594
  "epoch": 0.7264,
1595
+ "grad_norm": 0.8479386568069458,
1596
  "learning_rate": 7.977767796990792e-05,
1597
+ "loss": 0.0399,
1598
  "step": 2270
1599
  },
1600
  {
1601
  "epoch": 0.7296,
1602
+ "grad_norm": 0.870980441570282,
1603
  "learning_rate": 7.966539411632607e-05,
1604
+ "loss": 0.0396,
1605
  "step": 2280
1606
  },
1607
  {
1608
  "epoch": 0.7328,
1609
+ "grad_norm": 1.1692736148834229,
1610
  "learning_rate": 7.955311026274422e-05,
1611
+ "loss": 0.0403,
1612
  "step": 2290
1613
  },
1614
  {
1615
  "epoch": 0.736,
1616
+ "grad_norm": 1.4128285646438599,
1617
  "learning_rate": 7.944082640916236e-05,
1618
+ "loss": 0.0408,
1619
  "step": 2300
1620
  },
1621
  {
1622
  "epoch": 0.7392,
1623
+ "grad_norm": 0.9887726306915283,
1624
  "learning_rate": 7.932854255558051e-05,
1625
+ "loss": 0.0382,
1626
  "step": 2310
1627
  },
1628
  {
1629
  "epoch": 0.7424,
1630
+ "grad_norm": 0.6805205345153809,
1631
  "learning_rate": 7.921625870199866e-05,
1632
+ "loss": 0.0344,
1633
  "step": 2320
1634
  },
1635
  {
1636
  "epoch": 0.7456,
1637
+ "grad_norm": 0.6173204779624939,
1638
  "learning_rate": 7.910397484841681e-05,
1639
+ "loss": 0.0339,
1640
  "step": 2330
1641
  },
1642
  {
1643
  "epoch": 0.7488,
1644
+ "grad_norm": 1.1889104843139648,
1645
  "learning_rate": 7.899169099483495e-05,
1646
+ "loss": 0.0362,
1647
  "step": 2340
1648
  },
1649
  {
1650
  "epoch": 0.752,
1651
+ "grad_norm": 0.812631368637085,
1652
  "learning_rate": 7.887940714125309e-05,
1653
+ "loss": 0.0373,
1654
  "step": 2350
1655
  },
1656
  {
1657
  "epoch": 0.7552,
1658
+ "grad_norm": 0.7912474274635315,
1659
  "learning_rate": 7.876712328767124e-05,
1660
+ "loss": 0.0347,
1661
  "step": 2360
1662
  },
1663
  {
1664
  "epoch": 0.7584,
1665
+ "grad_norm": 0.8621821403503418,
1666
  "learning_rate": 7.865483943408937e-05,
1667
+ "loss": 0.039,
1668
  "step": 2370
1669
  },
1670
  {
1671
  "epoch": 0.7616,
1672
+ "grad_norm": 1.7311482429504395,
1673
  "learning_rate": 7.854255558050752e-05,
1674
+ "loss": 0.0334,
1675
  "step": 2380
1676
  },
1677
  {
1678
  "epoch": 0.7648,
1679
+ "grad_norm": 1.260198712348938,
1680
  "learning_rate": 7.843027172692568e-05,
1681
+ "loss": 0.0364,
1682
  "step": 2390
1683
  },
1684
  {
1685
  "epoch": 0.768,
1686
+ "grad_norm": 0.5560032725334167,
1687
  "learning_rate": 7.831798787334381e-05,
1688
+ "loss": 0.0331,
1689
  "step": 2400
1690
  },
1691
  {
1692
  "epoch": 0.7712,
1693
+ "grad_norm": 0.7563883066177368,
1694
  "learning_rate": 7.820570401976196e-05,
1695
+ "loss": 0.0335,
1696
  "step": 2410
1697
  },
1698
  {
1699
  "epoch": 0.7744,
1700
+ "grad_norm": 0.991214394569397,
1701
  "learning_rate": 7.809342016618011e-05,
1702
+ "loss": 0.0348,
1703
  "step": 2420
1704
  },
1705
  {
1706
  "epoch": 0.7776,
1707
+ "grad_norm": 0.5576212406158447,
1708
  "learning_rate": 7.798113631259825e-05,
1709
+ "loss": 0.0344,
1710
  "step": 2430
1711
  },
1712
  {
1713
  "epoch": 0.7808,
1714
+ "grad_norm": 0.9191322326660156,
1715
  "learning_rate": 7.78688524590164e-05,
1716
+ "loss": 0.0376,
1717
  "step": 2440
1718
  },
1719
  {
1720
  "epoch": 0.784,
1721
+ "grad_norm": 4.326545715332031,
1722
  "learning_rate": 7.775656860543454e-05,
1723
+ "loss": 0.0822,
1724
  "step": 2450
1725
  },
1726
  {
1727
  "epoch": 0.7872,
1728
+ "grad_norm": 0.7509451508522034,
1729
  "learning_rate": 7.764428475185269e-05,
1730
+ "loss": 0.0359,
1731
  "step": 2460
1732
  },
1733
  {
1734
  "epoch": 0.7904,
1735
+ "grad_norm": 1.5423306226730347,
1736
  "learning_rate": 7.753200089827083e-05,
1737
+ "loss": 0.0349,
1738
  "step": 2470
1739
  },
1740
  {
1741
  "epoch": 0.7936,
1742
+ "grad_norm": 0.6744070053100586,
1743
  "learning_rate": 7.741971704468898e-05,
1744
+ "loss": 0.0322,
1745
  "step": 2480
1746
  },
1747
  {
1748
  "epoch": 0.7968,
1749
+ "grad_norm": 0.46497827768325806,
1750
  "learning_rate": 7.730743319110713e-05,
1751
+ "loss": 0.0299,
1752
  "step": 2490
1753
  },
1754
  {
1755
  "epoch": 0.8,
1756
+ "grad_norm": 0.7538796067237854,
1757
  "learning_rate": 7.719514933752526e-05,
1758
+ "loss": 0.03,
1759
  "step": 2500
1760
  },
1761
  {
1762
  "epoch": 0.8032,
1763
+ "grad_norm": 1.8027441501617432,
1764
  "learning_rate": 7.708286548394342e-05,
1765
+ "loss": 0.0325,
1766
  "step": 2510
1767
  },
1768
  {
1769
  "epoch": 0.8064,
1770
+ "grad_norm": 0.7862191796302795,
1771
  "learning_rate": 7.697058163036157e-05,
1772
+ "loss": 0.0346,
1773
  "step": 2520
1774
  },
1775
  {
1776
  "epoch": 0.8096,
1777
+ "grad_norm": 0.9970144033432007,
1778
  "learning_rate": 7.68582977767797e-05,
1779
+ "loss": 0.0349,
1780
  "step": 2530
1781
  },
1782
  {
1783
  "epoch": 0.8128,
1784
+ "grad_norm": 0.6299024820327759,
1785
  "learning_rate": 7.674601392319784e-05,
1786
+ "loss": 0.0305,
1787
  "step": 2540
1788
  },
1789
  {
1790
  "epoch": 0.816,
1791
+ "grad_norm": 0.883230447769165,
1792
  "learning_rate": 7.663373006961599e-05,
1793
+ "loss": 0.0384,
1794
  "step": 2550
1795
  },
1796
  {
1797
  "epoch": 0.8192,
1798
+ "grad_norm": 1.0779157876968384,
1799
  "learning_rate": 7.652144621603414e-05,
1800
+ "loss": 0.0377,
1801
  "step": 2560
1802
  },
1803
  {
1804
  "epoch": 0.8224,
1805
+ "grad_norm": 0.8372220396995544,
1806
  "learning_rate": 7.640916236245228e-05,
1807
+ "loss": 0.0299,
1808
  "step": 2570
1809
  },
1810
  {
1811
  "epoch": 0.8256,
1812
+ "grad_norm": 0.5274995565414429,
1813
  "learning_rate": 7.629687850887043e-05,
1814
+ "loss": 0.0338,
1815
  "step": 2580
1816
  },
1817
  {
1818
  "epoch": 0.8288,
1819
+ "grad_norm": 0.8475728034973145,
1820
  "learning_rate": 7.618459465528858e-05,
1821
+ "loss": 0.0314,
1822
  "step": 2590
1823
  },
1824
  {
1825
  "epoch": 0.832,
1826
+ "grad_norm": 1.0869860649108887,
1827
  "learning_rate": 7.607231080170672e-05,
1828
+ "loss": 0.0288,
1829
  "step": 2600
1830
  },
1831
  {
1832
  "epoch": 0.8352,
1833
+ "grad_norm": 0.8045733571052551,
1834
  "learning_rate": 7.596002694812487e-05,
1835
+ "loss": 0.031,
1836
  "step": 2610
1837
  },
1838
  {
1839
  "epoch": 0.8384,
1840
+ "grad_norm": 0.815394937992096,
1841
  "learning_rate": 7.5847743094543e-05,
1842
+ "loss": 0.028,
1843
  "step": 2620
1844
  },
1845
  {
1846
  "epoch": 0.8416,
1847
+ "grad_norm": 0.7129034996032715,
1848
  "learning_rate": 7.573545924096114e-05,
1849
+ "loss": 0.031,
1850
  "step": 2630
1851
  },
1852
  {
1853
  "epoch": 0.8448,
1854
+ "grad_norm": 0.6846269965171814,
1855
  "learning_rate": 7.562317538737929e-05,
1856
+ "loss": 0.036,
1857
  "step": 2640
1858
  },
1859
  {
1860
  "epoch": 0.848,
1861
+ "grad_norm": 2.122080087661743,
1862
  "learning_rate": 7.551089153379744e-05,
1863
+ "loss": 0.0301,
1864
  "step": 2650
1865
  },
1866
  {
1867
  "epoch": 0.8512,
1868
+ "grad_norm": 1.3065335750579834,
1869
  "learning_rate": 7.53986076802156e-05,
1870
+ "loss": 0.0314,
1871
  "step": 2660
1872
  },
1873
  {
1874
  "epoch": 0.8544,
1875
+ "grad_norm": 1.0742933750152588,
1876
  "learning_rate": 7.528632382663373e-05,
1877
+ "loss": 0.0318,
1878
  "step": 2670
1879
  },
1880
  {
1881
  "epoch": 0.8576,
1882
+ "grad_norm": 0.6569668650627136,
1883
  "learning_rate": 7.517403997305188e-05,
1884
+ "loss": 0.0281,
1885
  "step": 2680
1886
  },
1887
  {
1888
  "epoch": 0.8608,
1889
+ "grad_norm": 0.6425995826721191,
1890
  "learning_rate": 7.506175611947003e-05,
1891
+ "loss": 0.0282,
1892
  "step": 2690
1893
  },
1894
  {
1895
  "epoch": 0.864,
1896
+ "grad_norm": 0.6735262870788574,
1897
  "learning_rate": 7.494947226588817e-05,
1898
+ "loss": 0.0303,
1899
  "step": 2700
1900
  },
1901
  {
1902
  "epoch": 0.8672,
1903
+ "grad_norm": 0.7769756317138672,
1904
  "learning_rate": 7.483718841230631e-05,
1905
+ "loss": 0.0283,
1906
  "step": 2710
1907
  },
1908
  {
1909
  "epoch": 0.8704,
1910
+ "grad_norm": 0.7856665849685669,
1911
  "learning_rate": 7.472490455872446e-05,
1912
+ "loss": 0.0278,
1913
  "step": 2720
1914
  },
1915
  {
1916
  "epoch": 0.8736,
1917
+ "grad_norm": 0.9197099208831787,
1918
  "learning_rate": 7.46126207051426e-05,
1919
+ "loss": 0.0317,
1920
  "step": 2730
1921
  },
1922
  {
1923
  "epoch": 0.8768,
1924
+ "grad_norm": 0.4260849952697754,
1925
  "learning_rate": 7.450033685156075e-05,
1926
+ "loss": 0.0287,
1927
  "step": 2740
1928
  },
1929
  {
1930
  "epoch": 0.88,
1931
+ "grad_norm": 0.8799973130226135,
1932
  "learning_rate": 7.43880529979789e-05,
1933
+ "loss": 0.0294,
1934
  "step": 2750
1935
  },
1936
  {
1937
  "epoch": 0.8832,
1938
+ "grad_norm": 1.2114665508270264,
1939
  "learning_rate": 7.427576914439703e-05,
1940
+ "loss": 0.0331,
1941
  "step": 2760
1942
  },
1943
  {
1944
  "epoch": 0.8864,
1945
+ "grad_norm": 0.6824879050254822,
1946
  "learning_rate": 7.416348529081518e-05,
1947
+ "loss": 0.0296,
1948
  "step": 2770
1949
  },
1950
  {
1951
  "epoch": 0.8896,
1952
+ "grad_norm": 1.274061918258667,
1953
  "learning_rate": 7.405120143723333e-05,
1954
+ "loss": 0.0289,
1955
  "step": 2780
1956
  },
1957
  {
1958
  "epoch": 0.8928,
1959
+ "grad_norm": 0.6117509603500366,
1960
  "learning_rate": 7.393891758365149e-05,
1961
+ "loss": 0.0255,
1962
  "step": 2790
1963
  },
1964
  {
1965
  "epoch": 0.896,
1966
+ "grad_norm": 0.5068930983543396,
1967
  "learning_rate": 7.382663373006962e-05,
1968
+ "loss": 0.0281,
1969
  "step": 2800
1970
  },
1971
  {
1972
  "epoch": 0.8992,
1973
+ "grad_norm": 0.5966737270355225,
1974
  "learning_rate": 7.371434987648776e-05,
1975
+ "loss": 0.026,
1976
  "step": 2810
1977
  },
1978
  {
1979
  "epoch": 0.9024,
1980
+ "grad_norm": 0.7945902347564697,
1981
  "learning_rate": 7.360206602290591e-05,
1982
+ "loss": 0.0271,
1983
  "step": 2820
1984
  },
1985
  {
1986
  "epoch": 0.9056,
1987
+ "grad_norm": 0.7903382182121277,
1988
  "learning_rate": 7.348978216932405e-05,
1989
+ "loss": 0.0261,
1990
  "step": 2830
1991
  },
1992
  {
1993
  "epoch": 0.9088,
1994
+ "grad_norm": 1.1210882663726807,
1995
  "learning_rate": 7.33774983157422e-05,
1996
+ "loss": 0.0319,
1997
  "step": 2840
1998
  },
1999
  {
2000
  "epoch": 0.912,
2001
+ "grad_norm": 0.7618020176887512,
2002
  "learning_rate": 7.326521446216035e-05,
2003
+ "loss": 0.0319,
2004
  "step": 2850
2005
  },
2006
  {
2007
  "epoch": 0.9152,
2008
+ "grad_norm": 0.5775326490402222,
2009
  "learning_rate": 7.315293060857849e-05,
2010
+ "loss": 0.0293,
2011
  "step": 2860
2012
  },
2013
  {
2014
  "epoch": 0.9184,
2015
+ "grad_norm": 0.532517671585083,
2016
  "learning_rate": 7.304064675499664e-05,
2017
+ "loss": 0.0272,
2018
  "step": 2870
2019
  },
2020
  {
2021
  "epoch": 0.9216,
2022
+ "grad_norm": 1.226197361946106,
2023
  "learning_rate": 7.292836290141479e-05,
2024
+ "loss": 0.0278,
2025
  "step": 2880
2026
  },
2027
  {
2028
  "epoch": 0.9248,
2029
+ "grad_norm": 0.5860388875007629,
2030
  "learning_rate": 7.281607904783292e-05,
2031
+ "loss": 0.0274,
2032
  "step": 2890
2033
  },
2034
  {
2035
  "epoch": 0.928,
2036
+ "grad_norm": 0.4597097337245941,
2037
  "learning_rate": 7.270379519425106e-05,
2038
+ "loss": 0.0259,
2039
  "step": 2900
2040
  },
2041
  {
2042
  "epoch": 0.9312,
2043
+ "grad_norm": 0.654680073261261,
2044
  "learning_rate": 7.259151134066921e-05,
2045
+ "loss": 0.0258,
2046
  "step": 2910
2047
  },
2048
  {
2049
  "epoch": 0.9344,
2050
+ "grad_norm": 0.47539767622947693,
2051
  "learning_rate": 7.247922748708736e-05,
2052
+ "loss": 0.023,
2053
  "step": 2920
2054
  },
2055
  {
2056
  "epoch": 0.9376,
2057
+ "grad_norm": 0.5486841201782227,
2058
  "learning_rate": 7.23669436335055e-05,
2059
+ "loss": 0.0227,
2060
  "step": 2930
2061
  },
2062
  {
2063
  "epoch": 0.9408,
2064
+ "grad_norm": 0.9400618672370911,
2065
  "learning_rate": 7.225465977992365e-05,
2066
+ "loss": 0.029,
2067
  "step": 2940
2068
  },
2069
  {
2070
  "epoch": 0.944,
2071
+ "grad_norm": 0.8965266942977905,
2072
  "learning_rate": 7.21423759263418e-05,
2073
+ "loss": 0.029,
2074
  "step": 2950
2075
  },
2076
  {
2077
  "epoch": 0.9472,
2078
+ "grad_norm": 0.6586723923683167,
2079
  "learning_rate": 7.203009207275994e-05,
2080
+ "loss": 0.0241,
2081
  "step": 2960
2082
  },
2083
  {
2084
  "epoch": 0.9504,
2085
+ "grad_norm": 0.7518870830535889,
2086
  "learning_rate": 7.191780821917809e-05,
2087
+ "loss": 0.0275,
2088
  "step": 2970
2089
  },
2090
  {
2091
  "epoch": 0.9536,
2092
+ "grad_norm": 1.0358211994171143,
2093
  "learning_rate": 7.180552436559623e-05,
2094
+ "loss": 0.0249,
2095
  "step": 2980
2096
  },
2097
  {
2098
  "epoch": 0.9568,
2099
+ "grad_norm": 0.6047177314758301,
2100
  "learning_rate": 7.169324051201438e-05,
2101
+ "loss": 0.029,
2102
  "step": 2990
2103
  },
2104
  {
2105
  "epoch": 0.96,
2106
+ "grad_norm": 0.6489489078521729,
2107
  "learning_rate": 7.158095665843251e-05,
2108
+ "loss": 0.0259,
2109
  "step": 3000
2110
  },
2111
  {
2112
  "epoch": 0.9632,
2113
+ "grad_norm": 0.6869999766349792,
2114
  "learning_rate": 7.146867280485066e-05,
2115
+ "loss": 0.0245,
2116
  "step": 3010
2117
  },
2118
  {
2119
  "epoch": 0.9664,
2120
+ "grad_norm": 0.5727821588516235,
2121
  "learning_rate": 7.135638895126882e-05,
2122
+ "loss": 0.0236,
2123
  "step": 3020
2124
  },
2125
  {
2126
  "epoch": 0.9696,
2127
+ "grad_norm": 0.9350295066833496,
2128
  "learning_rate": 7.124410509768695e-05,
2129
+ "loss": 0.0253,
2130
  "step": 3030
2131
  },
2132
  {
2133
  "epoch": 0.9728,
2134
+ "grad_norm": 0.8133239150047302,
2135
  "learning_rate": 7.11318212441051e-05,
2136
+ "loss": 0.0266,
2137
  "step": 3040
2138
  },
2139
  {
2140
  "epoch": 0.976,
2141
+ "grad_norm": 0.39781591296195984,
2142
  "learning_rate": 7.101953739052325e-05,
2143
+ "loss": 0.0241,
2144
  "step": 3050
2145
  },
2146
  {
2147
  "epoch": 0.9792,
2148
+ "grad_norm": 0.8405945897102356,
2149
  "learning_rate": 7.090725353694139e-05,
2150
+ "loss": 0.0253,
2151
  "step": 3060
2152
  },
2153
  {
2154
  "epoch": 0.9824,
2155
+ "grad_norm": 0.6194741129875183,
2156
  "learning_rate": 7.079496968335954e-05,
2157
+ "loss": 0.0253,
2158
  "step": 3070
2159
  },
2160
  {
2161
  "epoch": 0.9856,
2162
+ "grad_norm": 0.5068514943122864,
2163
  "learning_rate": 7.068268582977768e-05,
2164
+ "loss": 0.0226,
2165
  "step": 3080
2166
  },
2167
  {
2168
  "epoch": 0.9888,
2169
+ "grad_norm": 0.8302213549613953,
2170
  "learning_rate": 7.057040197619582e-05,
2171
+ "loss": 0.0237,
2172
  "step": 3090
2173
  },
2174
  {
2175
  "epoch": 0.992,
2176
+ "grad_norm": 0.9723827838897705,
2177
  "learning_rate": 7.045811812261397e-05,
2178
+ "loss": 0.0256,
2179
  "step": 3100
2180
  },
2181
  {
2182
  "epoch": 0.9952,
2183
+ "grad_norm": 0.9212355017662048,
2184
  "learning_rate": 7.034583426903212e-05,
2185
+ "loss": 0.0243,
2186
  "step": 3110
2187
  },
2188
  {
2189
  "epoch": 0.9984,
2190
+ "grad_norm": 0.3177473247051239,
2191
  "learning_rate": 7.023355041545027e-05,
2192
+ "loss": 0.0201,
2193
  "step": 3120
2194
  }
2195
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58dc4832b9ecbedb58e177e8210247010a0ec93903efa66a51b87b3bc91d64e4
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1a838d849888b7d1ac8e4ee59146d9c162c6903d6b779d4f8666af6807ac008
3
  size 5304