End of training

Browse files

Files changed (6) hide show

README.md +16 -2
all_results.json +9 -9
eval_results.json +5 -5
log_history.json +574 -574
train_results.json +4 -4
trainer_state.json +575 -575

README.md CHANGED Viewed

@@ -1,6 +1,7 @@
 ---
 library_name: transformers
 tags:
 - generated_from_trainer
 datasets:
 - voxceleb
@@ -8,7 +9,20 @@ metrics:
 - accuracy
 model-index:
 - name: xvector-voxceleb1
-  results: []
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -16,7 +30,7 @@ should probably proofread and complete it, then remove this comment. -->
 # xvector-voxceleb1
-This model is a fine-tuned version of [](https://huggingface.co/) on the voxceleb dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.2981
 - Accuracy: 0.9405

 ---
 library_name: transformers
 tags:
+- audio-classification
 - generated_from_trainer
 datasets:
 - voxceleb
 - accuracy
 model-index:
 - name: xvector-voxceleb1
+  results:
+  - task:
+      name: Audio Classification
+      type: audio-classification
+    dataset:
+      name: confit/voxceleb
+      type: voxceleb
+      config: verification
+      split: train
+      args: verification
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.9405314497140935
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 # xvector-voxceleb1
+This model is a fine-tuned version of [](https://huggingface.co/) on the confit/voxceleb dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.2981
 - Accuracy: 0.9405

all_results.json CHANGED Viewed

@@ -1,13 +1,13 @@
 {
     "epoch": 10.0,
-    "eval_accuracy": 0.9410023545240498,
-    "eval_loss": 0.29460111260414124,
-    "eval_runtime": 49.9476,
-    "eval_samples_per_second": 297.612,
-    "eval_steps_per_second": 297.612,
     "total_flos": 1.96318398191328e+18,
-    "train_loss": 2.1099621225725396,
-    "train_runtime": 19361.582,
-    "train_samples_per_second": 69.094,
-    "train_steps_per_second": 0.27
 }

 {
     "epoch": 10.0,
+    "eval_accuracy": 0.9405314497140935,
+    "eval_loss": 0.29811325669288635,
+    "eval_runtime": 45.8572,
+    "eval_samples_per_second": 324.159,
+    "eval_steps_per_second": 324.159,
     "total_flos": 1.96318398191328e+18,
+    "train_loss": 2.122584131525306,
+    "train_runtime": 50832.5393,
+    "train_samples_per_second": 26.317,
+    "train_steps_per_second": 0.103
 }

eval_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 10.0,
-    "eval_accuracy": 0.9410023545240498,
-    "eval_loss": 0.29460111260414124,
-    "eval_runtime": 49.9476,
-    "eval_samples_per_second": 297.612,
-    "eval_steps_per_second": 297.612
 }

 {
     "epoch": 10.0,
+    "eval_accuracy": 0.9405314497140935,
+    "eval_loss": 0.29811325669288635,
+    "eval_runtime": 45.8572,
+    "eval_samples_per_second": 324.159,
+    "eval_steps_per_second": 324.159
 }

log_history.json CHANGED Viewed

@@ -1,1927 +1,1927 @@
 [
     {
-        "loss": 7.1456,
-        "grad_norm": 4.7170538902282715,
         "learning_rate": 3.824091778202677e-05,
         "epoch": 0.03824091778202677,
         "step": 20
     },
     {
-        "loss": 7.1158,
-        "grad_norm": 4.5134358406066895,
         "learning_rate": 7.648183556405354e-05,
         "epoch": 0.07648183556405354,
         "step": 40
     },
     {
-        "loss": 7.0575,
-        "grad_norm": 3.9098806381225586,
         "learning_rate": 0.0001147227533460803,
         "epoch": 0.1147227533460803,
         "step": 60
     },
     {
-        "loss": 6.9743,
-        "grad_norm": 3.2200050354003906,
         "learning_rate": 0.00015296367112810707,
         "epoch": 0.15296367112810708,
         "step": 80
     },
     {
-        "loss": 6.874,
-        "grad_norm": 2.5479934215545654,
         "learning_rate": 0.00019120458891013384,
         "epoch": 0.19120458891013384,
         "step": 100
     },
     {
-        "loss": 6.7278,
-        "grad_norm": 2.168301820755005,
         "learning_rate": 0.0002294455066921606,
         "epoch": 0.2294455066921606,
         "step": 120
     },
     {
-        "loss": 6.5778,
-        "grad_norm": 2.0083394050598145,
         "learning_rate": 0.0002676864244741874,
         "epoch": 0.2676864244741874,
         "step": 140
     },
     {
-        "loss": 6.4192,
-        "grad_norm": 1.8298897743225098,
         "learning_rate": 0.00030592734225621415,
         "epoch": 0.30592734225621415,
         "step": 160
     },
     {
-        "loss": 6.2416,
-        "grad_norm": 1.782423734664917,
         "learning_rate": 0.00034416826003824094,
         "epoch": 0.3441682600382409,
         "step": 180
     },
     {
-        "loss": 6.0624,
-        "grad_norm": 1.8139146566390991,
         "learning_rate": 0.0003824091778202677,
         "epoch": 0.3824091778202677,
         "step": 200
     },
     {
-        "loss": 5.9686,
-        "grad_norm": 1.7659958600997925,
         "learning_rate": 0.0004206500956022944,
         "epoch": 0.42065009560229444,
         "step": 220
     },
     {
-        "loss": 5.8142,
-        "grad_norm": 1.8660094738006592,
         "learning_rate": 0.0004588910133843212,
         "epoch": 0.4588910133843212,
         "step": 240
     },
     {
-        "loss": 5.6944,
-        "grad_norm": 1.831566333770752,
         "learning_rate": 0.0004971319311663481,
         "epoch": 0.497131931166348,
         "step": 260
     },
     {
-        "loss": 5.6101,
-        "grad_norm": 1.7546241283416748,
         "learning_rate": 0.0005353728489483748,
         "epoch": 0.5353728489483748,
         "step": 280
     },
     {
-        "loss": 5.5192,
-        "grad_norm": 1.8890600204467773,
         "learning_rate": 0.0005736137667304016,
         "epoch": 0.5736137667304015,
         "step": 300
     },
     {
-        "loss": 5.3822,
-        "grad_norm": 1.7542874813079834,
         "learning_rate": 0.0006118546845124283,
         "epoch": 0.6118546845124283,
         "step": 320
     },
     {
-        "loss": 5.3236,
-        "grad_norm": 1.8762731552124023,
         "learning_rate": 0.000650095602294455,
         "epoch": 0.6500956022944551,
         "step": 340
     },
     {
-        "loss": 5.2483,
-        "grad_norm": 1.886903166770935,
         "learning_rate": 0.0006883365200764819,
         "epoch": 0.6883365200764818,
         "step": 360
     },
     {
-        "loss": 5.1245,
-        "grad_norm": 1.9873583316802979,
         "learning_rate": 0.0007265774378585086,
         "epoch": 0.7265774378585086,
         "step": 380
     },
     {
-        "loss": 5.0771,
-        "grad_norm": 1.953506350517273,
         "learning_rate": 0.0007648183556405354,
         "epoch": 0.7648183556405354,
         "step": 400
     },
     {
-        "loss": 5.0354,
-        "grad_norm": 1.851192831993103,
         "learning_rate": 0.0008030592734225621,
         "epoch": 0.8030592734225621,
         "step": 420
     },
     {
-        "loss": 4.9532,
-        "grad_norm": 1.861971139907837,
         "learning_rate": 0.0008413001912045888,
         "epoch": 0.8413001912045889,
         "step": 440
     },
     {
-        "loss": 4.8698,
-        "grad_norm": 1.9388970136642456,
         "learning_rate": 0.0008795411089866157,
         "epoch": 0.8795411089866156,
         "step": 460
     },
     {
-        "loss": 4.847,
-        "grad_norm": 1.919184684753418,
         "learning_rate": 0.0009177820267686424,
         "epoch": 0.9177820267686424,
         "step": 480
     },
     {
-        "loss": 4.7754,
-        "grad_norm": 1.8886795043945312,
         "learning_rate": 0.0009560229445506692,
         "epoch": 0.9560229445506692,
         "step": 500
     },
     {
-        "loss": 4.6728,
-        "grad_norm": 1.9832813739776611,
         "learning_rate": 0.0009942638623326961,
         "epoch": 0.994263862332696,
         "step": 520
     },
     {
-        "eval_loss": 4.345639228820801,
-        "eval_accuracy": 0.15042045072317525,
-        "eval_runtime": 211.2136,
-        "eval_samples_per_second": 70.379,
-        "eval_steps_per_second": 70.379,
         "epoch": 1.0,
         "step": 523
     },
     {
-        "loss": 4.5657,
-        "grad_norm": 1.831260323524475,
         "learning_rate": 0.0009963883577650309,
         "epoch": 1.0325047801147227,
         "step": 540
     },
     {
-        "loss": 4.5047,
-        "grad_norm": 1.934414029121399,
         "learning_rate": 0.0009921393669003612,
         "epoch": 1.0707456978967496,
         "step": 560
     },
     {
-        "loss": 4.4165,
-        "grad_norm": 1.7718721628189087,
         "learning_rate": 0.0009878903760356915,
         "epoch": 1.1089866156787762,
         "step": 580
     },
     {
-        "loss": 4.3933,
-        "grad_norm": 1.741455078125,
         "learning_rate": 0.0009836413851710218,
         "epoch": 1.147227533460803,
         "step": 600
     },
     {
-        "loss": 4.3249,
-        "grad_norm": 1.8857481479644775,
         "learning_rate": 0.0009793923943063523,
         "epoch": 1.1854684512428297,
         "step": 620
     },
     {
-        "loss": 4.2147,
-        "grad_norm": 1.8325748443603516,
         "learning_rate": 0.0009751434034416827,
         "epoch": 1.2237093690248566,
         "step": 640
     },
     {
-        "loss": 4.1569,
-        "grad_norm": 1.8758591413497925,
         "learning_rate": 0.000970894412577013,
         "epoch": 1.2619502868068833,
         "step": 660
     },
     {
-        "loss": 4.131,
-        "grad_norm": 1.899542212486267,
         "learning_rate": 0.0009666454217123433,
         "epoch": 1.3001912045889101,
         "step": 680
     },
     {
-        "loss": 4.0467,
-        "grad_norm": 1.8188538551330566,
         "learning_rate": 0.0009623964308476737,
         "epoch": 1.338432122370937,
         "step": 700
     },
     {
-        "loss": 3.9904,
-        "grad_norm": 1.7679705619812012,
         "learning_rate": 0.000958147439983004,
         "epoch": 1.3766730401529637,
         "step": 720
     },
     {
-        "loss": 3.9464,
-        "grad_norm": 1.849482774734497,
         "learning_rate": 0.0009538984491183344,
         "epoch": 1.4149139579349903,
         "step": 740
     },
     {
-        "loss": 3.9104,
-        "grad_norm": 1.8237632513046265,
         "learning_rate": 0.0009496494582536647,
         "epoch": 1.4531548757170172,
         "step": 760
     },
     {
-        "loss": 3.8441,
-        "grad_norm": 1.8175936937332153,
         "learning_rate": 0.0009454004673889951,
         "epoch": 1.491395793499044,
         "step": 780
     },
     {
-        "loss": 3.7898,
-        "grad_norm": 1.7967997789382935,
         "learning_rate": 0.0009411514765243255,
         "epoch": 1.5296367112810707,
         "step": 800
     },
     {
-        "loss": 3.6894,
-        "grad_norm": 1.7681634426116943,
         "learning_rate": 0.0009369024856596558,
         "epoch": 1.5678776290630974,
         "step": 820
     },
     {
-        "loss": 3.6798,
-        "grad_norm": 1.8655925989151,
         "learning_rate": 0.0009326534947949862,
         "epoch": 1.6061185468451242,
         "step": 840
     },
     {
-        "loss": 3.6297,
-        "grad_norm": 1.853769302368164,
         "learning_rate": 0.0009284045039303166,
         "epoch": 1.644359464627151,
         "step": 860
     },
     {
-        "loss": 3.5592,
-        "grad_norm": 1.8198288679122925,
         "learning_rate": 0.0009241555130656469,
         "epoch": 1.682600382409178,
         "step": 880
     },
     {
-        "loss": 3.5056,
-        "grad_norm": 1.7744460105895996,
         "learning_rate": 0.0009199065222009773,
         "epoch": 1.7208413001912046,
         "step": 900
     },
     {
-        "loss": 3.4635,
-        "grad_norm": 1.797914981842041,
         "learning_rate": 0.0009156575313363077,
         "epoch": 1.7590822179732313,
         "step": 920
     },
     {
-        "loss": 3.4434,
-        "grad_norm": 1.8479169607162476,
         "learning_rate": 0.000911408540471638,
         "epoch": 1.7973231357552581,
         "step": 940
     },
     {
-        "loss": 3.441,
-        "grad_norm": 1.818405032157898,
         "learning_rate": 0.0009071595496069684,
         "epoch": 1.835564053537285,
         "step": 960
     },
     {
-        "loss": 3.3934,
-        "grad_norm": 1.7609572410583496,
         "learning_rate": 0.0009029105587422988,
         "epoch": 1.8738049713193117,
         "step": 980
     },
     {
-        "loss": 3.2961,
-        "grad_norm": 1.7228211164474487,
         "learning_rate": 0.0008986615678776291,
         "epoch": 1.9120458891013383,
         "step": 1000
     },
     {
-        "loss": 3.2611,
-        "grad_norm": 1.8148291110992432,
         "learning_rate": 0.0008944125770129595,
         "epoch": 1.9502868068833652,
         "step": 1020
     },
     {
-        "loss": 3.224,
-        "grad_norm": 1.933300495147705,
         "learning_rate": 0.0008901635861482899,
         "epoch": 1.988527724665392,
         "step": 1040
     },
     {
-        "eval_loss": 2.258894205093384,
-        "eval_accuracy": 0.5140935082408342,
-        "eval_runtime": 203.9361,
-        "eval_samples_per_second": 72.89,
-        "eval_steps_per_second": 72.89,
         "epoch": 2.0,
         "step": 1046
     },
     {
-        "loss": 3.1667,
-        "grad_norm": 1.819346308708191,
         "learning_rate": 0.0008859145952836202,
         "epoch": 2.026768642447419,
         "step": 1060
     },
     {
-        "loss": 3.0232,
-        "grad_norm": 1.7024896144866943,
         "learning_rate": 0.0008816656044189504,
         "epoch": 2.0650095602294454,
         "step": 1080
     },
     {
-        "loss": 3.0489,
-        "grad_norm": 1.7023948431015015,
         "learning_rate": 0.000877416613554281,
         "epoch": 2.1032504780114722,
         "step": 1100
     },
     {
-        "loss": 2.9732,
-        "grad_norm": 1.804140329360962,
         "learning_rate": 0.0008731676226896112,
         "epoch": 2.141491395793499,
         "step": 1120
     },
     {
-        "loss": 2.9562,
-        "grad_norm": 1.7260992527008057,
         "learning_rate": 0.0008689186318249415,
         "epoch": 2.179732313575526,
         "step": 1140
     },
     {
-        "loss": 2.8875,
-        "grad_norm": 1.7970356941223145,
         "learning_rate": 0.000864669640960272,
         "epoch": 2.2179732313575524,
         "step": 1160
     },
     {
-        "loss": 2.916,
-        "grad_norm": 1.8579261302947998,
         "learning_rate": 0.0008604206500956023,
         "epoch": 2.2562141491395793,
         "step": 1180
     },
     {
-        "loss": 2.8963,
-        "grad_norm": 1.852342128753662,
         "learning_rate": 0.0008561716592309326,
         "epoch": 2.294455066921606,
         "step": 1200
     },
     {
-        "loss": 2.8029,
-        "grad_norm": 1.8845752477645874,
         "learning_rate": 0.000851922668366263,
         "epoch": 2.332695984703633,
         "step": 1220
     },
     {
-        "loss": 2.8237,
-        "grad_norm": 1.883952260017395,
         "learning_rate": 0.0008476736775015934,
         "epoch": 2.3709369024856595,
         "step": 1240
     },
     {
-        "loss": 2.8473,
-        "grad_norm": 1.8383756875991821,
         "learning_rate": 0.0008434246866369237,
         "epoch": 2.4091778202676863,
         "step": 1260
     },
     {
-        "loss": 2.7722,
-        "grad_norm": 1.8900470733642578,
         "learning_rate": 0.0008391756957722541,
         "epoch": 2.447418738049713,
         "step": 1280
     },
     {
-        "loss": 2.7584,
-        "grad_norm": 1.8097845315933228,
         "learning_rate": 0.0008349267049075845,
         "epoch": 2.48565965583174,
         "step": 1300
     },
     {
-        "loss": 2.7134,
-        "grad_norm": 1.7215895652770996,
         "learning_rate": 0.0008306777140429148,
         "epoch": 2.5239005736137665,
         "step": 1320
     },
     {
-        "loss": 2.6531,
-        "grad_norm": 1.8249051570892334,
         "learning_rate": 0.0008264287231782451,
         "epoch": 2.5621414913957934,
         "step": 1340
     },
     {
-        "loss": 2.6675,
-        "grad_norm": 1.8082237243652344,
         "learning_rate": 0.0008221797323135756,
         "epoch": 2.6003824091778203,
         "step": 1360
     },
     {
-        "loss": 2.5702,
-        "grad_norm": 1.7981261014938354,
         "learning_rate": 0.0008179307414489059,
         "epoch": 2.638623326959847,
         "step": 1380
     },
     {
-        "loss": 2.6339,
-        "grad_norm": 1.6964036226272583,
         "learning_rate": 0.0008136817505842362,
         "epoch": 2.676864244741874,
         "step": 1400
     },
     {
-        "loss": 2.5489,
-        "grad_norm": 1.755050778388977,
         "learning_rate": 0.0008094327597195667,
         "epoch": 2.7151051625239004,
         "step": 1420
     },
     {
-        "loss": 2.5908,
-        "grad_norm": 1.7242581844329834,
         "learning_rate": 0.000805183768854897,
         "epoch": 2.7533460803059273,
         "step": 1440
     },
     {
-        "loss": 2.5143,
-        "grad_norm": 1.819612741470337,
         "learning_rate": 0.0008009347779902273,
         "epoch": 2.791586998087954,
         "step": 1460
     },
     {
-        "loss": 2.4662,
-        "grad_norm": 1.7033363580703735,
         "learning_rate": 0.0007966857871255578,
         "epoch": 2.8298279158699806,
         "step": 1480
     },
     {
-        "loss": 2.4044,
-        "grad_norm": 1.7662159204483032,
         "learning_rate": 0.000792436796260888,
         "epoch": 2.8680688336520075,
         "step": 1500
     },
     {
-        "loss": 2.4636,
-        "grad_norm": 1.7460269927978516,
         "learning_rate": 0.0007881878053962183,
         "epoch": 2.9063097514340344,
         "step": 1520
     },
     {
-        "loss": 2.3955,
-        "grad_norm": 1.8268380165100098,
         "learning_rate": 0.0007839388145315488,
         "epoch": 2.9445506692160612,
         "step": 1540
     },
     {
-        "loss": 2.3964,
-        "grad_norm": 1.796981930732727,
         "learning_rate": 0.0007796898236668791,
         "epoch": 2.982791586998088,
         "step": 1560
     },
     {
-        "eval_loss": 1.4662528038024902,
-        "eval_accuracy": 0.6835519677093844,
-        "eval_runtime": 419.2044,
-        "eval_samples_per_second": 35.46,
-        "eval_steps_per_second": 35.46,
         "epoch": 3.0,
         "step": 1569
     },
     {
-        "loss": 2.3174,
-        "grad_norm": 1.7852272987365723,
         "learning_rate": 0.0007754408328022094,
         "epoch": 3.0210325047801145,
         "step": 1580
     },
     {
-        "loss": 2.2913,
-        "grad_norm": 1.8464534282684326,
         "learning_rate": 0.0007711918419375399,
         "epoch": 3.0592734225621414,
         "step": 1600
     },
     {
-        "loss": 2.2856,
-        "grad_norm": 1.7783145904541016,
         "learning_rate": 0.0007669428510728702,
         "epoch": 3.0975143403441683,
         "step": 1620
     },
     {
-        "loss": 2.2099,
-        "grad_norm": 1.744454264640808,
         "learning_rate": 0.0007626938602082005,
         "epoch": 3.135755258126195,
         "step": 1640
     },
     {
-        "loss": 2.23,
-        "grad_norm": 1.8276797533035278,
         "learning_rate": 0.0007584448693435309,
         "epoch": 3.173996175908222,
         "step": 1660
     },
     {
-        "loss": 2.1912,
-        "grad_norm": 1.8144315481185913,
         "learning_rate": 0.0007541958784788613,
         "epoch": 3.2122370936902485,
         "step": 1680
     },
     {
-        "loss": 2.1818,
-        "grad_norm": 1.8499830961227417,
         "learning_rate": 0.0007499468876141916,
         "epoch": 3.2504780114722753,
         "step": 1700
     },
     {
-        "loss": 2.1349,
-        "grad_norm": 1.7623099088668823,
         "learning_rate": 0.000745697896749522,
         "epoch": 3.288718929254302,
         "step": 1720
     },
     {
-        "loss": 2.1055,
-        "grad_norm": 1.8180640935897827,
         "learning_rate": 0.0007414489058848524,
         "epoch": 3.3269598470363286,
         "step": 1740
     },
     {
-        "loss": 2.1077,
-        "grad_norm": 1.8159964084625244,
         "learning_rate": 0.0007371999150201827,
         "epoch": 3.3652007648183555,
         "step": 1760
     },
     {
-        "loss": 2.0999,
-        "grad_norm": 1.7902129888534546,
         "learning_rate": 0.0007329509241555131,
         "epoch": 3.4034416826003824,
         "step": 1780
     },
     {
-        "loss": 2.1188,
-        "grad_norm": 1.7685898542404175,
         "learning_rate": 0.0007287019332908435,
         "epoch": 3.4416826003824093,
         "step": 1800
     },
     {
-        "loss": 2.0956,
-        "grad_norm": 1.758325219154358,
         "learning_rate": 0.0007244529424261738,
         "epoch": 3.479923518164436,
         "step": 1820
     },
     {
-        "loss": 2.0488,
-        "grad_norm": 1.7802537679672241,
         "learning_rate": 0.0007202039515615042,
         "epoch": 3.5181644359464626,
         "step": 1840
     },
     {
-        "loss": 2.0776,
-        "grad_norm": 1.8220280408859253,
         "learning_rate": 0.0007159549606968346,
         "epoch": 3.5564053537284894,
         "step": 1860
     },
     {
-        "loss": 2.0257,
-        "grad_norm": 1.8494378328323364,
         "learning_rate": 0.0007117059698321649,
         "epoch": 3.5946462715105163,
         "step": 1880
     },
     {
-        "loss": 2.0332,
-        "grad_norm": 1.719109296798706,
         "learning_rate": 0.0007074569789674953,
         "epoch": 3.632887189292543,
         "step": 1900
     },
     {
-        "loss": 2.0052,
-        "grad_norm": 1.9517509937286377,
         "learning_rate": 0.0007032079881028257,
         "epoch": 3.67112810707457,
         "step": 1920
     },
     {
-        "loss": 1.9901,
-        "grad_norm": 1.7318940162658691,
         "learning_rate": 0.0006989589972381559,
         "epoch": 3.7093690248565965,
         "step": 1940
     },
     {
-        "loss": 1.9411,
-        "grad_norm": 1.767015814781189,
         "learning_rate": 0.0006947100063734863,
         "epoch": 3.7476099426386233,
         "step": 1960
     },
     {
-        "loss": 2.0048,
-        "grad_norm": 1.761806607246399,
         "learning_rate": 0.0006904610155088166,
         "epoch": 3.78585086042065,
         "step": 1980
     },
     {
-        "loss": 1.9312,
-        "grad_norm": 1.7126002311706543,
         "learning_rate": 0.000686212024644147,
         "epoch": 3.8240917782026767,
         "step": 2000
     },
     {
-        "loss": 1.9443,
-        "grad_norm": 1.7167437076568604,
         "learning_rate": 0.0006819630337794774,
         "epoch": 3.8623326959847035,
         "step": 2020
     },
     {
-        "loss": 1.893,
-        "grad_norm": 1.749881386756897,
         "learning_rate": 0.0006777140429148077,
         "epoch": 3.9005736137667304,
         "step": 2040
     },
     {
-        "loss": 1.8876,
-        "grad_norm": 1.6846592426300049,
         "learning_rate": 0.0006734650520501381,
         "epoch": 3.9388145315487573,
         "step": 2060
     },
     {
-        "loss": 1.8474,
-        "grad_norm": 1.8149057626724243,
         "learning_rate": 0.0006692160611854685,
         "epoch": 3.977055449330784,
         "step": 2080
     },
     {
-        "eval_loss": 0.9547563195228577,
-        "eval_accuracy": 0.7926673393878237,
-        "eval_runtime": 183.3001,
-        "eval_samples_per_second": 81.097,
-        "eval_steps_per_second": 81.097,
         "epoch": 4.0,
         "step": 2092
     },
     {
-        "loss": 1.813,
-        "grad_norm": 1.6455098390579224,
         "learning_rate": 0.0006649670703207988,
         "epoch": 4.015296367112811,
         "step": 2100
     },
     {
-        "loss": 1.7354,
-        "grad_norm": 1.6958200931549072,
         "learning_rate": 0.0006607180794561292,
         "epoch": 4.053537284894838,
         "step": 2120
     },
     {
-        "loss": 1.7479,
-        "grad_norm": 1.7456037998199463,
         "learning_rate": 0.0006564690885914596,
         "epoch": 4.091778202676864,
         "step": 2140
     },
     {
-        "loss": 1.7138,
-        "grad_norm": 1.7887734174728394,
         "learning_rate": 0.0006522200977267899,
         "epoch": 4.130019120458891,
         "step": 2160
     },
     {
-        "loss": 1.7023,
-        "grad_norm": 1.7080284357070923,
         "learning_rate": 0.0006479711068621203,
         "epoch": 4.168260038240918,
         "step": 2180
     },
     {
-        "loss": 1.7526,
-        "grad_norm": 1.8061983585357666,
         "learning_rate": 0.0006437221159974506,
         "epoch": 4.2065009560229445,
         "step": 2200
     },
     {
-        "loss": 1.7474,
-        "grad_norm": 1.7831811904907227,
         "learning_rate": 0.000639473125132781,
         "epoch": 4.244741873804971,
         "step": 2220
     },
     {
-        "loss": 1.6688,
-        "grad_norm": 1.752357840538025,
         "learning_rate": 0.0006352241342681113,
         "epoch": 4.282982791586998,
         "step": 2240
     },
     {
-        "loss": 1.7009,
-        "grad_norm": 1.7843034267425537,
         "learning_rate": 0.0006309751434034417,
         "epoch": 4.321223709369025,
         "step": 2260
     },
     {
-        "loss": 1.6727,
-        "grad_norm": 1.7608367204666138,
         "learning_rate": 0.0006267261525387721,
         "epoch": 4.359464627151052,
         "step": 2280
     },
     {
-        "loss": 1.6801,
-        "grad_norm": 1.6877254247665405,
         "learning_rate": 0.0006224771616741024,
         "epoch": 4.397705544933078,
         "step": 2300
     },
     {
-        "loss": 1.7108,
-        "grad_norm": 1.7891350984573364,
         "learning_rate": 0.0006182281708094328,
         "epoch": 4.435946462715105,
         "step": 2320
     },
     {
-        "loss": 1.6442,
-        "grad_norm": 1.7104123830795288,
         "learning_rate": 0.0006139791799447631,
         "epoch": 4.474187380497132,
         "step": 2340
     },
     {
-        "loss": 1.6531,
-        "grad_norm": 1.7026969194412231,
         "learning_rate": 0.0006097301890800934,
         "epoch": 4.512428298279159,
         "step": 2360
     },
     {
-        "loss": 1.6539,
-        "grad_norm": 1.7890552282333374,
         "learning_rate": 0.0006054811982154238,
         "epoch": 4.550669216061186,
         "step": 2380
     },
     {
-        "loss": 1.6681,
-        "grad_norm": 1.8423861265182495,
         "learning_rate": 0.0006012322073507542,
         "epoch": 4.588910133843212,
         "step": 2400
     },
     {
-        "loss": 1.5935,
-        "grad_norm": 1.6434499025344849,
         "learning_rate": 0.0005969832164860845,
         "epoch": 4.627151051625239,
         "step": 2420
     },
     {
-        "loss": 1.6273,
-        "grad_norm": 1.7261130809783936,
         "learning_rate": 0.0005927342256214149,
         "epoch": 4.665391969407266,
         "step": 2440
     },
     {
-        "loss": 1.6181,
-        "grad_norm": 1.7288273572921753,
         "learning_rate": 0.0005884852347567453,
         "epoch": 4.7036328871892925,
         "step": 2460
     },
     {
-        "loss": 1.5719,
-        "grad_norm": 1.773258924484253,
         "learning_rate": 0.0005842362438920756,
         "epoch": 4.741873804971319,
         "step": 2480
     },
     {
-        "loss": 1.578,
-        "grad_norm": 1.7676658630371094,
         "learning_rate": 0.000579987253027406,
         "epoch": 4.780114722753346,
         "step": 2500
     },
     {
-        "loss": 1.535,
-        "grad_norm": 1.8115794658660889,
         "learning_rate": 0.0005757382621627364,
         "epoch": 4.818355640535373,
         "step": 2520
     },
     {
-        "loss": 1.5493,
-        "grad_norm": 1.7989414930343628,
         "learning_rate": 0.0005714892712980667,
         "epoch": 4.8565965583174,
         "step": 2540
     },
     {
-        "loss": 1.5489,
-        "grad_norm": 1.6607849597930908,
         "learning_rate": 0.000567240280433397,
         "epoch": 4.894837476099426,
         "step": 2560
     },
     {
-        "loss": 1.5091,
-        "grad_norm": 1.630257487297058,
         "learning_rate": 0.0005629912895687275,
         "epoch": 4.933078393881453,
         "step": 2580
     },
     {
-        "loss": 1.5275,
-        "grad_norm": 1.7995944023132324,
         "learning_rate": 0.0005587422987040578,
         "epoch": 4.97131931166348,
         "step": 2600
     },
     {
-        "eval_loss": 0.6697778105735779,
-        "eval_accuracy": 0.8571140262361251,
-        "eval_runtime": 181.3784,
-        "eval_samples_per_second": 81.956,
-        "eval_steps_per_second": 81.956,
         "epoch": 5.0,
         "step": 2615
     },
     {
-        "loss": 1.4774,
-        "grad_norm": 1.7868553400039673,
         "learning_rate": 0.0005544933078393881,
         "epoch": 5.009560229445507,
         "step": 2620
     },
     {
-        "loss": 1.3955,
-        "grad_norm": 1.6380654573440552,
         "learning_rate": 0.0005502443169747186,
         "epoch": 5.047801147227533,
         "step": 2640
     },
     {
-        "loss": 1.4414,
-        "grad_norm": 1.7844533920288086,
         "learning_rate": 0.0005459953261100489,
         "epoch": 5.08604206500956,
         "step": 2660
     },
     {
-        "loss": 1.3782,
-        "grad_norm": 1.779080867767334,
         "learning_rate": 0.0005417463352453792,
         "epoch": 5.124282982791587,
         "step": 2680
     },
     {
-        "loss": 1.4152,
-        "grad_norm": 1.741326928138733,
         "learning_rate": 0.0005374973443807097,
         "epoch": 5.162523900573614,
         "step": 2700
     },
     {
-        "loss": 1.3996,
-        "grad_norm": 1.7447401285171509,
         "learning_rate": 0.00053324835351604,
         "epoch": 5.2007648183556405,
         "step": 2720
     },
     {
-        "loss": 1.4137,
-        "grad_norm": 1.8067736625671387,
         "learning_rate": 0.0005289993626513702,
         "epoch": 5.239005736137667,
         "step": 2740
     },
     {
-        "loss": 1.3937,
-        "grad_norm": 1.7393046617507935,
         "learning_rate": 0.0005247503717867008,
         "epoch": 5.277246653919694,
         "step": 2760
     },
     {
-        "loss": 1.3912,
-        "grad_norm": 1.756184458732605,
         "learning_rate": 0.000520501380922031,
         "epoch": 5.315487571701721,
         "step": 2780
     },
     {
-        "loss": 1.387,
-        "grad_norm": 1.7133733034133911,
         "learning_rate": 0.0005162523900573613,
         "epoch": 5.353728489483748,
         "step": 2800
     },
     {
-        "loss": 1.3551,
-        "grad_norm": 1.6597713232040405,
         "learning_rate": 0.0005120033991926918,
         "epoch": 5.3919694072657744,
         "step": 2820
     },
     {
-        "loss": 1.3557,
-        "grad_norm": 1.8462845087051392,
         "learning_rate": 0.0005077544083280221,
         "epoch": 5.430210325047801,
         "step": 2840
     },
     {
-        "loss": 1.3495,
-        "grad_norm": 1.6737143993377686,
         "learning_rate": 0.0005035054174633524,
         "epoch": 5.468451242829828,
         "step": 2860
     },
     {
-        "loss": 1.394,
-        "grad_norm": 1.7071157693862915,
         "learning_rate": 0.0004992564265986828,
         "epoch": 5.506692160611855,
         "step": 2880
     },
     {
-        "loss": 1.3263,
-        "grad_norm": 1.663072943687439,
         "learning_rate": 0.0004950074357340132,
         "epoch": 5.544933078393882,
         "step": 2900
     },
     {
-        "loss": 1.3474,
-        "grad_norm": 1.640093207359314,
         "learning_rate": 0.0004907584448693436,
         "epoch": 5.583173996175908,
         "step": 2920
     },
     {
-        "loss": 1.3375,
-        "grad_norm": 1.762568712234497,
         "learning_rate": 0.0004865094540046739,
         "epoch": 5.621414913957935,
         "step": 2940
     },
     {
-        "loss": 1.3218,
-        "grad_norm": 1.6714434623718262,
         "learning_rate": 0.00048226046314000425,
         "epoch": 5.659655831739962,
         "step": 2960
     },
     {
-        "loss": 1.3008,
-        "grad_norm": 1.7594107389450073,
         "learning_rate": 0.0004780114722753346,
         "epoch": 5.6978967495219885,
         "step": 2980
     },
     {
-        "loss": 1.3331,
-        "grad_norm": 1.6483973264694214,
         "learning_rate": 0.000473762481410665,
         "epoch": 5.736137667304015,
         "step": 3000
     },
     {
-        "loss": 1.2775,
-        "grad_norm": 1.7252651453018188,
         "learning_rate": 0.00046951349054599533,
         "epoch": 5.774378585086042,
         "step": 3020
     },
     {
-        "loss": 1.2747,
-        "grad_norm": 1.7860745191574097,
         "learning_rate": 0.0004652644996813257,
         "epoch": 5.812619502868069,
         "step": 3040
     },
     {
-        "loss": 1.2946,
-        "grad_norm": 1.749874234199524,
         "learning_rate": 0.0004610155088166561,
         "epoch": 5.850860420650095,
         "step": 3060
     },
     {
-        "loss": 1.2849,
-        "grad_norm": 1.7197644710540771,
         "learning_rate": 0.0004567665179519864,
         "epoch": 5.8891013384321225,
         "step": 3080
     },
     {
-        "loss": 1.2544,
-        "grad_norm": 1.6396132707595825,
         "learning_rate": 0.00045251752708731676,
         "epoch": 5.927342256214149,
         "step": 3100
     },
     {
-        "loss": 1.248,
-        "grad_norm": 1.720376968383789,
         "learning_rate": 0.0004482685362226471,
         "epoch": 5.965583173996176,
         "step": 3120
     },
     {
-        "eval_loss": 0.5270123481750488,
-        "eval_accuracy": 0.8899428187016482,
-        "eval_runtime": 183.7621,
-        "eval_samples_per_second": 80.893,
-        "eval_steps_per_second": 80.893,
         "epoch": 6.0,
         "step": 3138
     },
     {
-        "loss": 1.2398,
-        "grad_norm": 1.5206599235534668,
         "learning_rate": 0.0004440195453579775,
         "epoch": 6.003824091778203,
         "step": 3140
     },
     {
-        "loss": 1.2097,
-        "grad_norm": 1.7172082662582397,
         "learning_rate": 0.00043977055449330785,
         "epoch": 6.042065009560229,
         "step": 3160
     },
     {
-        "loss": 1.1669,
-        "grad_norm": 1.5570909976959229,
         "learning_rate": 0.0004355215636286382,
         "epoch": 6.080305927342256,
         "step": 3180
     },
     {
-        "loss": 1.1647,
-        "grad_norm": 1.7044614553451538,
         "learning_rate": 0.0004312725727639686,
         "epoch": 6.118546845124283,
         "step": 3200
     },
     {
-        "loss": 1.1627,
-        "grad_norm": 1.5819571018218994,
         "learning_rate": 0.0004270235818992989,
         "epoch": 6.15678776290631,
         "step": 3220
     },
     {
-        "loss": 1.1728,
-        "grad_norm": 1.7076871395111084,
         "learning_rate": 0.0004227745910346293,
         "epoch": 6.195028680688337,
         "step": 3240
     },
     {
-        "loss": 1.1459,
-        "grad_norm": 1.7301490306854248,
         "learning_rate": 0.0004185256001699597,
         "epoch": 6.233269598470363,
         "step": 3260
     },
     {
-        "loss": 1.1676,
-        "grad_norm": 1.7135626077651978,
         "learning_rate": 0.00041427660930528997,
         "epoch": 6.27151051625239,
         "step": 3280
     },
     {
-        "loss": 1.1488,
-        "grad_norm": 1.602142572402954,
         "learning_rate": 0.00041002761844062037,
         "epoch": 6.309751434034417,
         "step": 3300
     },
     {
-        "loss": 1.1395,
-        "grad_norm": 1.755293846130371,
         "learning_rate": 0.00040577862757595076,
         "epoch": 6.347992351816444,
         "step": 3320
     },
     {
-        "loss": 1.1321,
-        "grad_norm": 1.663662314414978,
         "learning_rate": 0.00040152963671128105,
         "epoch": 6.3862332695984705,
         "step": 3340
     },
     {
-        "loss": 1.1317,
-        "grad_norm": 1.7366993427276611,
         "learning_rate": 0.00039728064584661145,
         "epoch": 6.424474187380497,
         "step": 3360
     },
     {
-        "loss": 1.1449,
-        "grad_norm": 1.7560149431228638,
         "learning_rate": 0.0003930316549819418,
         "epoch": 6.462715105162524,
         "step": 3380
     },
     {
-        "loss": 1.13,
-        "grad_norm": 1.7576582431793213,
         "learning_rate": 0.00038878266411727214,
         "epoch": 6.500956022944551,
         "step": 3400
     },
     {
-        "loss": 1.1419,
-        "grad_norm": 1.7916873693466187,
         "learning_rate": 0.00038453367325260254,
         "epoch": 6.539196940726577,
         "step": 3420
     },
     {
-        "loss": 1.1107,
-        "grad_norm": 1.5987508296966553,
         "learning_rate": 0.0003802846823879329,
         "epoch": 6.577437858508604,
         "step": 3440
     },
     {
-        "loss": 1.1162,
-        "grad_norm": 1.8192518949508667,
         "learning_rate": 0.0003760356915232632,
         "epoch": 6.615678776290631,
         "step": 3460
     },
     {
-        "loss": 1.1255,
-        "grad_norm": 1.7236486673355103,
         "learning_rate": 0.0003717867006585936,
         "epoch": 6.653919694072657,
         "step": 3480
     },
     {
-        "loss": 1.0629,
-        "grad_norm": 1.8209389448165894,
         "learning_rate": 0.0003675377097939239,
         "epoch": 6.692160611854685,
         "step": 3500
     },
     {
-        "loss": 1.0809,
-        "grad_norm": 1.652782678604126,
         "learning_rate": 0.0003632887189292543,
         "epoch": 6.730401529636711,
         "step": 3520
     },
     {
-        "loss": 1.1286,
-        "grad_norm": 1.6148645877838135,
         "learning_rate": 0.00035903972806458466,
         "epoch": 6.768642447418738,
         "step": 3540
     },
     {
-        "loss": 1.1069,
-        "grad_norm": 1.6869423389434814,
         "learning_rate": 0.000354790737199915,
         "epoch": 6.806883365200765,
         "step": 3560
     },
     {
-        "loss": 1.0911,
-        "grad_norm": 1.6373172998428345,
         "learning_rate": 0.0003505417463352454,
         "epoch": 6.845124282982791,
         "step": 3580
     },
     {
-        "loss": 1.0808,
-        "grad_norm": 1.6761549711227417,
         "learning_rate": 0.00034629275547057574,
         "epoch": 6.8833652007648185,
         "step": 3600
     },
     {
-        "loss": 1.0809,
-        "grad_norm": 1.6510460376739502,
         "learning_rate": 0.0003420437646059061,
         "epoch": 6.921606118546845,
         "step": 3620
     },
     {
         "loss": 1.0912,
-        "grad_norm": 1.7351855039596558,
         "learning_rate": 0.0003377947737412365,
         "epoch": 6.959847036328872,
         "step": 3640
     },
     {
-        "loss": 1.0991,
-        "grad_norm": 1.7165274620056152,
         "learning_rate": 0.00033354578287656683,
         "epoch": 6.998087954110899,
         "step": 3660
     },
     {
-        "eval_loss": 0.44995447993278503,
-        "eval_accuracy": 0.9037336024217961,
-        "eval_runtime": 189.5634,
-        "eval_samples_per_second": 78.417,
-        "eval_steps_per_second": 78.417,
         "epoch": 7.0,
         "step": 3661
     },
     {
-        "loss": 1.0154,
-        "grad_norm": 1.6468501091003418,
         "learning_rate": 0.0003292967920118972,
         "epoch": 7.036328871892925,
         "step": 3680
     },
     {
-        "loss": 1.0378,
-        "grad_norm": 1.79421067237854,
         "learning_rate": 0.0003250478011472275,
         "epoch": 7.074569789674952,
         "step": 3700
     },
     {
-        "loss": 1.0145,
-        "grad_norm": 1.7234885692596436,
         "learning_rate": 0.0003207988102825579,
         "epoch": 7.112810707456979,
         "step": 3720
     },
     {
-        "loss": 1.0012,
-        "grad_norm": 1.6947157382965088,
         "learning_rate": 0.00031654981941788826,
         "epoch": 7.151051625239006,
         "step": 3740
     },
     {
-        "loss": 1.0191,
-        "grad_norm": 1.6818758249282837,
         "learning_rate": 0.0003123008285532186,
         "epoch": 7.189292543021033,
         "step": 3760
     },
     {
-        "loss": 1.0437,
-        "grad_norm": 1.557080864906311,
         "learning_rate": 0.000308051837688549,
         "epoch": 7.227533460803059,
         "step": 3780
     },
     {
-        "loss": 1.0079,
-        "grad_norm": 1.6532793045043945,
         "learning_rate": 0.00030380284682387935,
         "epoch": 7.265774378585086,
         "step": 3800
     },
     {
-        "loss": 0.9904,
-        "grad_norm": 1.646686315536499,
         "learning_rate": 0.0002995538559592097,
         "epoch": 7.304015296367113,
         "step": 3820
     },
     {
-        "loss": 1.0002,
-        "grad_norm": 1.6772829294204712,
         "learning_rate": 0.0002953048650945401,
         "epoch": 7.342256214149139,
         "step": 3840
     },
     {
-        "loss": 0.9674,
-        "grad_norm": 1.6452054977416992,
         "learning_rate": 0.0002910558742298704,
         "epoch": 7.3804971319311665,
         "step": 3860
     },
     {
-        "loss": 0.9642,
-        "grad_norm": 1.592207908630371,
         "learning_rate": 0.0002868068833652008,
         "epoch": 7.418738049713193,
         "step": 3880
     },
     {
-        "loss": 0.973,
-        "grad_norm": 1.7015941143035889,
         "learning_rate": 0.0002825578925005312,
         "epoch": 7.45697896749522,
         "step": 3900
     },
     {
-        "loss": 0.9803,
-        "grad_norm": 1.6589232683181763,
         "learning_rate": 0.00027830890163586146,
         "epoch": 7.495219885277247,
         "step": 3920
     },
     {
-        "loss": 0.9702,
-        "grad_norm": 1.660190463066101,
         "learning_rate": 0.00027405991077119186,
         "epoch": 7.533460803059273,
         "step": 3940
     },
     {
-        "loss": 0.9919,
-        "grad_norm": 1.7052509784698486,
         "learning_rate": 0.00026981091990652226,
         "epoch": 7.5717017208413,
         "step": 3960
     },
     {
-        "loss": 0.938,
-        "grad_norm": 1.6874445676803589,
         "learning_rate": 0.00026556192904185255,
         "epoch": 7.609942638623327,
         "step": 3980
     },
     {
-        "loss": 0.9564,
-        "grad_norm": 1.811640739440918,
         "learning_rate": 0.00026131293817718295,
         "epoch": 7.648183556405353,
         "step": 4000
     },
     {
-        "loss": 0.9432,
-        "grad_norm": 1.741968035697937,
         "learning_rate": 0.00025706394731251324,
         "epoch": 7.686424474187381,
         "step": 4020
     },
     {
-        "loss": 0.9082,
-        "grad_norm": 1.6731518507003784,
         "learning_rate": 0.00025281495644784364,
         "epoch": 7.724665391969407,
         "step": 4040
     },
     {
-        "loss": 0.92,
-        "grad_norm": 1.7399870157241821,
         "learning_rate": 0.00024856596558317403,
         "epoch": 7.762906309751434,
         "step": 4060
     },
     {
-        "loss": 0.9433,
-        "grad_norm": 1.580674171447754,
         "learning_rate": 0.0002443169747185044,
         "epoch": 7.801147227533461,
         "step": 4080
     },
     {
-        "loss": 0.9584,
-        "grad_norm": 1.5683550834655762,
         "learning_rate": 0.00024006798385383472,
         "epoch": 7.839388145315487,
         "step": 4100
     },
     {
-        "loss": 0.9318,
-        "grad_norm": 1.7664682865142822,
         "learning_rate": 0.00023581899298916507,
         "epoch": 7.8776290630975145,
         "step": 4120
     },
     {
-        "loss": 0.8852,
-        "grad_norm": 1.5522887706756592,
         "learning_rate": 0.00023157000212449544,
         "epoch": 7.915869980879541,
         "step": 4140
     },
     {
-        "loss": 0.9121,
-        "grad_norm": 1.626836895942688,
         "learning_rate": 0.00022732101125982578,
         "epoch": 7.954110898661568,
         "step": 4160
     },
     {
-        "loss": 0.9221,
-        "grad_norm": 1.8042898178100586,
         "learning_rate": 0.00022307202039515615,
         "epoch": 7.992351816443595,
         "step": 4180
     },
     {
-        "eval_loss": 0.3572401702404022,
-        "eval_accuracy": 0.9266733938782374,
-        "eval_runtime": 168.0336,
-        "eval_samples_per_second": 88.464,
-        "eval_steps_per_second": 88.464,
         "epoch": 8.0,
         "step": 4184
     },
     {
-        "loss": 0.8979,
-        "grad_norm": 1.545024037361145,
         "learning_rate": 0.00021882302953048652,
         "epoch": 8.030592734225621,
         "step": 4200
     },
     {
-        "loss": 0.8464,
-        "grad_norm": 1.592607855796814,
         "learning_rate": 0.00021457403866581687,
         "epoch": 8.068833652007648,
         "step": 4220
     },
     {
-        "loss": 0.8634,
-        "grad_norm": 1.5347646474838257,
         "learning_rate": 0.0002103250478011472,
         "epoch": 8.107074569789676,
         "step": 4240
     },
     {
-        "loss": 0.8425,
-        "grad_norm": 1.621201515197754,
         "learning_rate": 0.0002060760569364776,
         "epoch": 8.145315487571702,
         "step": 4260
     },
     {
-        "loss": 0.8776,
-        "grad_norm": 1.7381062507629395,
         "learning_rate": 0.00020182706607180795,
         "epoch": 8.183556405353729,
         "step": 4280
     },
     {
-        "loss": 0.854,
-        "grad_norm": 1.5798373222351074,
         "learning_rate": 0.0001975780752071383,
         "epoch": 8.221797323135755,
         "step": 4300
     },
     {
-        "loss": 0.8646,
-        "grad_norm": 1.5751338005065918,
         "learning_rate": 0.00019332908434246867,
         "epoch": 8.260038240917781,
         "step": 4320
     },
     {
-        "loss": 0.8521,
-        "grad_norm": 1.570742130279541,
         "learning_rate": 0.00018908009347779904,
         "epoch": 8.29827915869981,
         "step": 4340
     },
     {
-        "loss": 0.8858,
-        "grad_norm": 1.7959846258163452,
         "learning_rate": 0.00018483110261312938,
         "epoch": 8.336520076481836,
         "step": 4360
     },
     {
-        "loss": 0.8707,
-        "grad_norm": 1.7537908554077148,
         "learning_rate": 0.00018058211174845973,
         "epoch": 8.374760994263863,
         "step": 4380
     },
     {
-        "loss": 0.8375,
-        "grad_norm": 1.635578989982605,
         "learning_rate": 0.0001763331208837901,
         "epoch": 8.413001912045889,
         "step": 4400
     },
     {
-        "loss": 0.8631,
-        "grad_norm": 1.5729222297668457,
         "learning_rate": 0.00017208413001912047,
         "epoch": 8.451242829827915,
         "step": 4420
     },
     {
-        "loss": 0.8551,
-        "grad_norm": 1.6586476564407349,
         "learning_rate": 0.00016783513915445082,
         "epoch": 8.489483747609942,
         "step": 4440
     },
     {
-        "loss": 0.8637,
-        "grad_norm": 1.6118619441986084,
         "learning_rate": 0.00016358614828978119,
         "epoch": 8.52772466539197,
         "step": 4460
     },
     {
-        "loss": 0.8484,
-        "grad_norm": 1.5538595914840698,
         "learning_rate": 0.00015933715742511153,
         "epoch": 8.565965583173996,
         "step": 4480
     },
     {
-        "loss": 0.8433,
-        "grad_norm": 1.5646642446517944,
         "learning_rate": 0.0001550881665604419,
         "epoch": 8.604206500956023,
         "step": 4500
     },
     {
-        "loss": 0.8592,
-        "grad_norm": 1.7190415859222412,
         "learning_rate": 0.00015083917569577227,
         "epoch": 8.64244741873805,
         "step": 4520
     },
     {
-        "loss": 0.8236,
-        "grad_norm": 1.4950307607650757,
         "learning_rate": 0.00014659018483110262,
         "epoch": 8.680688336520076,
         "step": 4540
     },
     {
-        "loss": 0.8421,
-        "grad_norm": 1.5117732286453247,
         "learning_rate": 0.00014234119396643296,
         "epoch": 8.718929254302104,
         "step": 4560
     },
     {
-        "loss": 0.8287,
-        "grad_norm": 1.5558750629425049,
         "learning_rate": 0.00013809220310176336,
         "epoch": 8.75717017208413,
         "step": 4580
     },
     {
-        "loss": 0.8492,
-        "grad_norm": 1.7955564260482788,
         "learning_rate": 0.0001338432122370937,
         "epoch": 8.795411089866157,
         "step": 4600
     },
     {
-        "loss": 0.8419,
-        "grad_norm": 1.6532599925994873,
         "learning_rate": 0.00012959422137242405,
         "epoch": 8.833652007648183,
         "step": 4620
     },
     {
-        "loss": 0.8125,
-        "grad_norm": 1.7040739059448242,
         "learning_rate": 0.0001253452305077544,
         "epoch": 8.87189292543021,
         "step": 4640
     },
     {
-        "loss": 0.8187,
-        "grad_norm": 1.7040703296661377,
         "learning_rate": 0.00012109623964308478,
         "epoch": 8.910133843212238,
         "step": 4660
     },
     {
-        "loss": 0.8155,
-        "grad_norm": 1.7090845108032227,
         "learning_rate": 0.00011684724877841513,
         "epoch": 8.948374760994264,
         "step": 4680
     },
     {
-        "loss": 0.7997,
-        "grad_norm": 1.6070616245269775,
         "learning_rate": 0.00011259825791374549,
         "epoch": 8.98661567877629,
         "step": 4700
     },
     {
-        "eval_loss": 0.3138497769832611,
         "eval_accuracy": 0.9352842246888665,
-        "eval_runtime": 177.561,
-        "eval_samples_per_second": 83.718,
-        "eval_steps_per_second": 83.718,
         "epoch": 9.0,
         "step": 4707
     },
     {
-        "loss": 0.7906,
-        "grad_norm": 1.5590825080871582,
         "learning_rate": 0.00010834926704907585,
         "epoch": 9.024856596558317,
         "step": 4720
     },
     {
-        "loss": 0.7494,
-        "grad_norm": 1.4745252132415771,
         "learning_rate": 0.0001041002761844062,
         "epoch": 9.063097514340344,
         "step": 4740
     },
     {
-        "loss": 0.7854,
-        "grad_norm": 1.61099112033844,
         "learning_rate": 9.985128531973658e-05,
         "epoch": 9.101338432122372,
         "step": 4760
     },
     {
-        "loss": 0.7636,
-        "grad_norm": 1.5839650630950928,
         "learning_rate": 9.560229445506692e-05,
         "epoch": 9.139579349904398,
         "step": 4780
     },
     {
-        "loss": 0.776,
-        "grad_norm": 1.7259138822555542,
         "learning_rate": 9.135330359039729e-05,
         "epoch": 9.177820267686425,
         "step": 4800
     },
     {
-        "loss": 0.7444,
-        "grad_norm": 1.5495970249176025,
         "learning_rate": 8.710431272572764e-05,
         "epoch": 9.216061185468451,
         "step": 4820
     },
     {
-        "loss": 0.7603,
-        "grad_norm": 1.5250838994979858,
         "learning_rate": 8.2855321861058e-05,
         "epoch": 9.254302103250478,
         "step": 4840
     },
     {
-        "loss": 0.7561,
-        "grad_norm": 1.6244220733642578,
         "learning_rate": 7.860633099638836e-05,
         "epoch": 9.292543021032504,
         "step": 4860
     },
     {
-        "loss": 0.7908,
-        "grad_norm": 1.6825993061065674,
         "learning_rate": 7.435734013171871e-05,
         "epoch": 9.330783938814532,
         "step": 4880
     },
     {
-        "loss": 0.7517,
-        "grad_norm": 1.563707947731018,
         "learning_rate": 7.010834926704908e-05,
         "epoch": 9.369024856596559,
         "step": 4900
     },
     {
-        "loss": 0.7679,
-        "grad_norm": 1.7463629245758057,
         "learning_rate": 6.585935840237942e-05,
         "epoch": 9.407265774378585,
         "step": 4920
     },
     {
-        "loss": 0.7426,
-        "grad_norm": 1.5689053535461426,
         "learning_rate": 6.16103675377098e-05,
         "epoch": 9.445506692160611,
         "step": 4940
     },
     {
-        "loss": 0.7695,
-        "grad_norm": 1.6512914896011353,
         "learning_rate": 5.736137667304015e-05,
         "epoch": 9.483747609942638,
         "step": 4960
     },
     {
-        "loss": 0.7603,
-        "grad_norm": 1.6542084217071533,
         "learning_rate": 5.311238580837052e-05,
         "epoch": 9.521988527724666,
         "step": 4980
     },
     {
-        "loss": 0.754,
-        "grad_norm": 1.6929945945739746,
         "learning_rate": 4.8863394943700874e-05,
         "epoch": 9.560229445506693,
         "step": 5000
     },
     {
-        "loss": 0.7597,
-        "grad_norm": 1.4880517721176147,
         "learning_rate": 4.461440407903123e-05,
         "epoch": 9.598470363288719,
         "step": 5020
     },
     {
-        "loss": 0.7624,
-        "grad_norm": 1.578971266746521,
         "learning_rate": 4.036541321436159e-05,
         "epoch": 9.636711281070745,
         "step": 5040
     },
     {
-        "loss": 0.7653,
-        "grad_norm": 1.616727352142334,
         "learning_rate": 3.6116422349691954e-05,
         "epoch": 9.674952198852772,
         "step": 5060
     },
     {
-        "loss": 0.7352,
-        "grad_norm": 1.6762784719467163,
         "learning_rate": 3.186743148502231e-05,
         "epoch": 9.7131931166348,
         "step": 5080
     },
     {
-        "loss": 0.732,
-        "grad_norm": 1.5666388273239136,
         "learning_rate": 2.7618440620352666e-05,
         "epoch": 9.751434034416826,
         "step": 5100
     },
     {
-        "loss": 0.7631,
-        "grad_norm": 1.641012191772461,
         "learning_rate": 2.3369449755683023e-05,
         "epoch": 9.789674952198853,
         "step": 5120
     },
     {
-        "loss": 0.7153,
-        "grad_norm": 1.7024327516555786,
         "learning_rate": 1.9120458891013384e-05,
         "epoch": 9.82791586998088,
         "step": 5140
     },
     {
-        "loss": 0.7247,
-        "grad_norm": 1.4840829372406006,
         "learning_rate": 1.4871468026343743e-05,
         "epoch": 9.866156787762906,
         "step": 5160
     },
     {
-        "loss": 0.7289,
-        "grad_norm": 1.627562165260315,
         "learning_rate": 1.0622477161674103e-05,
         "epoch": 9.904397705544934,
         "step": 5180
     },
     {
-        "loss": 0.7563,
-        "grad_norm": 1.6473079919815063,
         "learning_rate": 6.373486297004461e-06,
         "epoch": 9.94263862332696,
         "step": 5200
     },
     {
-        "loss": 0.7603,
-        "grad_norm": 1.577776312828064,
         "learning_rate": 2.1244954323348204e-06,
         "epoch": 9.980879541108987,
         "step": 5220
     },
     {
-        "eval_loss": 0.29460111260414124,
-        "eval_accuracy": 0.9410023545240498,
-        "eval_runtime": 187.2975,
-        "eval_samples_per_second": 79.366,
-        "eval_steps_per_second": 79.366,
         "epoch": 10.0,
         "step": 5230
     },
     {
-        "train_runtime": 19361.582,
-        "train_samples_per_second": 69.094,
-        "train_steps_per_second": 0.27,
         "total_flos": 1.96318398191328e+18,
-        "train_loss": 2.1099621225725396,
         "epoch": 10.0,
         "step": 5230
     }

 [
     {
+        "loss": 7.1455,
+        "grad_norm": 4.702692031860352,
         "learning_rate": 3.824091778202677e-05,
         "epoch": 0.03824091778202677,
         "step": 20
     },
     {
+        "loss": 7.116,
+        "grad_norm": 4.504467487335205,
         "learning_rate": 7.648183556405354e-05,
         "epoch": 0.07648183556405354,
         "step": 40
     },
     {
+        "loss": 7.0566,
+        "grad_norm": 3.964728832244873,
         "learning_rate": 0.0001147227533460803,
         "epoch": 0.1147227533460803,
         "step": 60
     },
     {
+        "loss": 6.972,
+        "grad_norm": 3.199570417404175,
         "learning_rate": 0.00015296367112810707,
         "epoch": 0.15296367112810708,
         "step": 80
     },
     {
+        "loss": 6.8778,
+        "grad_norm": 2.6367344856262207,
         "learning_rate": 0.00019120458891013384,
         "epoch": 0.19120458891013384,
         "step": 100
     },
     {
+        "loss": 6.733,
+        "grad_norm": 2.1930582523345947,
         "learning_rate": 0.0002294455066921606,
         "epoch": 0.2294455066921606,
         "step": 120
     },
     {
+        "loss": 6.5814,
+        "grad_norm": 1.9982482194900513,
         "learning_rate": 0.0002676864244741874,
         "epoch": 0.2676864244741874,
         "step": 140
     },
     {
+        "loss": 6.4206,
+        "grad_norm": 1.8051823377609253,
         "learning_rate": 0.00030592734225621415,
         "epoch": 0.30592734225621415,
         "step": 160
     },
     {
+        "loss": 6.2424,
+        "grad_norm": 1.757859706878662,
         "learning_rate": 0.00034416826003824094,
         "epoch": 0.3441682600382409,
         "step": 180
     },
     {
+        "loss": 6.0657,
+        "grad_norm": 1.7747690677642822,
         "learning_rate": 0.0003824091778202677,
         "epoch": 0.3824091778202677,
         "step": 200
     },
     {
+        "loss": 5.9722,
+        "grad_norm": 1.7719988822937012,
         "learning_rate": 0.0004206500956022944,
         "epoch": 0.42065009560229444,
         "step": 220
     },
     {
+        "loss": 5.814,
+        "grad_norm": 1.934911847114563,
         "learning_rate": 0.0004588910133843212,
         "epoch": 0.4588910133843212,
         "step": 240
     },
     {
+        "loss": 5.6998,
+        "grad_norm": 1.793244481086731,
         "learning_rate": 0.0004971319311663481,
         "epoch": 0.497131931166348,
         "step": 260
     },
     {
+        "loss": 5.6115,
+        "grad_norm": 1.7790417671203613,
         "learning_rate": 0.0005353728489483748,
         "epoch": 0.5353728489483748,
         "step": 280
     },
     {
+        "loss": 5.5236,
+        "grad_norm": 1.8355106115341187,
         "learning_rate": 0.0005736137667304016,
         "epoch": 0.5736137667304015,
         "step": 300
     },
     {
+        "loss": 5.3894,
+        "grad_norm": 1.7114174365997314,
         "learning_rate": 0.0006118546845124283,
         "epoch": 0.6118546845124283,
         "step": 320
     },
     {
+        "loss": 5.3336,
+        "grad_norm": 1.877690315246582,
         "learning_rate": 0.000650095602294455,
         "epoch": 0.6500956022944551,
         "step": 340
     },
     {
+        "loss": 5.2577,
+        "grad_norm": 1.8771674633026123,
         "learning_rate": 0.0006883365200764819,
         "epoch": 0.6883365200764818,
         "step": 360
     },
     {
+        "loss": 5.1333,
+        "grad_norm": 1.9654275178909302,
         "learning_rate": 0.0007265774378585086,
         "epoch": 0.7265774378585086,
         "step": 380
     },
     {
+        "loss": 5.0882,
+        "grad_norm": 1.9327517747879028,
         "learning_rate": 0.0007648183556405354,
         "epoch": 0.7648183556405354,
         "step": 400
     },
     {
+        "loss": 5.0404,
+        "grad_norm": 1.8918468952178955,
         "learning_rate": 0.0008030592734225621,
         "epoch": 0.8030592734225621,
         "step": 420
     },
     {
+        "loss": 4.9524,
+        "grad_norm": 1.8536239862442017,
         "learning_rate": 0.0008413001912045888,
         "epoch": 0.8413001912045889,
         "step": 440
     },
     {
+        "loss": 4.8673,
+        "grad_norm": 1.8849778175354004,
         "learning_rate": 0.0008795411089866157,
         "epoch": 0.8795411089866156,
         "step": 460
     },
     {
+        "loss": 4.8456,
+        "grad_norm": 1.8270999193191528,
         "learning_rate": 0.0009177820267686424,
         "epoch": 0.9177820267686424,
         "step": 480
     },
     {
+        "loss": 4.7748,
+        "grad_norm": 1.8521556854248047,
         "learning_rate": 0.0009560229445506692,
         "epoch": 0.9560229445506692,
         "step": 500
     },
     {
+        "loss": 4.6869,
+        "grad_norm": 1.9331077337265015,
         "learning_rate": 0.0009942638623326961,
         "epoch": 0.994263862332696,
         "step": 520
     },
     {
+        "eval_loss": 4.119868278503418,
+        "eval_accuracy": 0.19596367305751766,
+        "eval_runtime": 539.7217,
+        "eval_samples_per_second": 27.542,
+        "eval_steps_per_second": 27.542,
         "epoch": 1.0,
         "step": 523
     },
     {
+        "loss": 4.5805,
+        "grad_norm": 1.7894033193588257,
         "learning_rate": 0.0009963883577650309,
         "epoch": 1.0325047801147227,
         "step": 540
     },
     {
+        "loss": 4.5223,
+        "grad_norm": 1.9136412143707275,
         "learning_rate": 0.0009921393669003612,
         "epoch": 1.0707456978967496,
         "step": 560
     },
     {
+        "loss": 4.4213,
+        "grad_norm": 1.8342599868774414,
         "learning_rate": 0.0009878903760356915,
         "epoch": 1.1089866156787762,
         "step": 580
     },
     {
+        "loss": 4.409,
+        "grad_norm": 1.7360094785690308,
         "learning_rate": 0.0009836413851710218,
         "epoch": 1.147227533460803,
         "step": 600
     },
     {
+        "loss": 4.3382,
+        "grad_norm": 1.8302013874053955,
         "learning_rate": 0.0009793923943063523,
         "epoch": 1.1854684512428297,
         "step": 620
     },
     {
+        "loss": 4.2381,
+        "grad_norm": 1.847433090209961,
         "learning_rate": 0.0009751434034416827,
         "epoch": 1.2237093690248566,
         "step": 640
     },
     {
+        "loss": 4.1802,
+        "grad_norm": 1.866734504699707,
         "learning_rate": 0.000970894412577013,
         "epoch": 1.2619502868068833,
         "step": 660
     },
     {
+        "loss": 4.1404,
+        "grad_norm": 1.9123674631118774,
         "learning_rate": 0.0009666454217123433,
         "epoch": 1.3001912045889101,
         "step": 680
     },
     {
+        "loss": 4.059,
+        "grad_norm": 1.8355252742767334,
         "learning_rate": 0.0009623964308476737,
         "epoch": 1.338432122370937,
         "step": 700
     },
     {
+        "loss": 4.0086,
+        "grad_norm": 1.7890186309814453,
         "learning_rate": 0.000958147439983004,
         "epoch": 1.3766730401529637,
         "step": 720
     },
     {
+        "loss": 3.9641,
+        "grad_norm": 1.847299337387085,
         "learning_rate": 0.0009538984491183344,
         "epoch": 1.4149139579349903,
         "step": 740
     },
     {
+        "loss": 3.9291,
+        "grad_norm": 1.8219211101531982,
         "learning_rate": 0.0009496494582536647,
         "epoch": 1.4531548757170172,
         "step": 760
     },
     {
+        "loss": 3.8577,
+        "grad_norm": 1.8026444911956787,
         "learning_rate": 0.0009454004673889951,
         "epoch": 1.491395793499044,
         "step": 780
     },
     {
+        "loss": 3.8234,
+        "grad_norm": 1.7959771156311035,
         "learning_rate": 0.0009411514765243255,
         "epoch": 1.5296367112810707,
         "step": 800
     },
     {
+        "loss": 3.7132,
+        "grad_norm": 1.7638428211212158,
         "learning_rate": 0.0009369024856596558,
         "epoch": 1.5678776290630974,
         "step": 820
     },
     {
+        "loss": 3.7057,
+        "grad_norm": 1.8295230865478516,
         "learning_rate": 0.0009326534947949862,
         "epoch": 1.6061185468451242,
         "step": 840
     },
     {
+        "loss": 3.6656,
+        "grad_norm": 1.8669532537460327,
         "learning_rate": 0.0009284045039303166,
         "epoch": 1.644359464627151,
         "step": 860
     },
     {
+        "loss": 3.591,
+        "grad_norm": 1.8251906633377075,
         "learning_rate": 0.0009241555130656469,
         "epoch": 1.682600382409178,
         "step": 880
     },
     {
+        "loss": 3.5281,
+        "grad_norm": 1.825548529624939,
         "learning_rate": 0.0009199065222009773,
         "epoch": 1.7208413001912046,
         "step": 900
     },
     {
+        "loss": 3.4841,
+        "grad_norm": 1.7811344861984253,
         "learning_rate": 0.0009156575313363077,
         "epoch": 1.7590822179732313,
         "step": 920
     },
     {
+        "loss": 3.4707,
+        "grad_norm": 1.823803186416626,
         "learning_rate": 0.000911408540471638,
         "epoch": 1.7973231357552581,
         "step": 940
     },
     {
+        "loss": 3.4532,
+        "grad_norm": 1.872054100036621,
         "learning_rate": 0.0009071595496069684,
         "epoch": 1.835564053537285,
         "step": 960
     },
     {
+        "loss": 3.4224,
+        "grad_norm": 1.7651457786560059,
         "learning_rate": 0.0009029105587422988,
         "epoch": 1.8738049713193117,
         "step": 980
     },
     {
+        "loss": 3.3199,
+        "grad_norm": 1.7336605787277222,
         "learning_rate": 0.0008986615678776291,
         "epoch": 1.9120458891013383,
         "step": 1000
     },
     {
+        "loss": 3.2814,
+        "grad_norm": 1.8058604001998901,
         "learning_rate": 0.0008944125770129595,
         "epoch": 1.9502868068833652,
         "step": 1020
     },
     {
+        "loss": 3.2423,
+        "grad_norm": 1.9400501251220703,
         "learning_rate": 0.0008901635861482899,
         "epoch": 1.988527724665392,
         "step": 1040
     },
     {
+        "eval_loss": 2.282437801361084,
+        "eval_accuracy": 0.5047426841574167,
+        "eval_runtime": 710.9931,
+        "eval_samples_per_second": 20.907,
+        "eval_steps_per_second": 20.907,
         "epoch": 2.0,
         "step": 1046
     },
     {
+        "loss": 3.1858,
+        "grad_norm": 1.7496325969696045,
         "learning_rate": 0.0008859145952836202,
         "epoch": 2.026768642447419,
         "step": 1060
     },
     {
+        "loss": 3.0482,
+        "grad_norm": 1.7203223705291748,
         "learning_rate": 0.0008816656044189504,
         "epoch": 2.0650095602294454,
         "step": 1080
     },
     {
+        "loss": 3.0764,
+        "grad_norm": 1.6859164237976074,
         "learning_rate": 0.000877416613554281,
         "epoch": 2.1032504780114722,
         "step": 1100
     },
     {
+        "loss": 2.9918,
+        "grad_norm": 1.887332558631897,
         "learning_rate": 0.0008731676226896112,
         "epoch": 2.141491395793499,
         "step": 1120
     },
     {
+        "loss": 2.9791,
+        "grad_norm": 1.7712619304656982,
         "learning_rate": 0.0008689186318249415,
         "epoch": 2.179732313575526,
         "step": 1140
     },
     {
+        "loss": 2.9064,
+        "grad_norm": 1.8518322706222534,
         "learning_rate": 0.000864669640960272,
         "epoch": 2.2179732313575524,
         "step": 1160
     },
     {
+        "loss": 2.9346,
+        "grad_norm": 1.8636976480484009,
         "learning_rate": 0.0008604206500956023,
         "epoch": 2.2562141491395793,
         "step": 1180
     },
     {
+        "loss": 2.9154,
+        "grad_norm": 1.8007034063339233,
         "learning_rate": 0.0008561716592309326,
         "epoch": 2.294455066921606,
         "step": 1200
     },
     {
+        "loss": 2.8311,
+        "grad_norm": 1.8480207920074463,
         "learning_rate": 0.000851922668366263,
         "epoch": 2.332695984703633,
         "step": 1220
     },
     {
+        "loss": 2.843,
+        "grad_norm": 1.8463302850723267,
         "learning_rate": 0.0008476736775015934,
         "epoch": 2.3709369024856595,
         "step": 1240
     },
     {
+        "loss": 2.8704,
+        "grad_norm": 1.8563566207885742,
         "learning_rate": 0.0008434246866369237,
         "epoch": 2.4091778202676863,
         "step": 1260
     },
     {
+        "loss": 2.7794,
+        "grad_norm": 1.8388174772262573,
         "learning_rate": 0.0008391756957722541,
         "epoch": 2.447418738049713,
         "step": 1280
     },
     {
+        "loss": 2.7776,
+        "grad_norm": 1.787711262702942,
         "learning_rate": 0.0008349267049075845,
         "epoch": 2.48565965583174,
         "step": 1300
     },
     {
+        "loss": 2.7288,
+        "grad_norm": 1.6573237180709839,
         "learning_rate": 0.0008306777140429148,
         "epoch": 2.5239005736137665,
         "step": 1320
     },
     {
+        "loss": 2.6598,
+        "grad_norm": 1.8304985761642456,
         "learning_rate": 0.0008264287231782451,
         "epoch": 2.5621414913957934,
         "step": 1340
     },
     {
+        "loss": 2.6842,
+        "grad_norm": 1.769439458847046,
         "learning_rate": 0.0008221797323135756,
         "epoch": 2.6003824091778203,
         "step": 1360
     },
     {
+        "loss": 2.5974,
+        "grad_norm": 1.7404167652130127,
         "learning_rate": 0.0008179307414489059,
         "epoch": 2.638623326959847,
         "step": 1380
     },
     {
+        "loss": 2.6557,
+        "grad_norm": 1.7064534425735474,
         "learning_rate": 0.0008136817505842362,
         "epoch": 2.676864244741874,
         "step": 1400
     },
     {
+        "loss": 2.5701,
+        "grad_norm": 1.784652590751648,
         "learning_rate": 0.0008094327597195667,
         "epoch": 2.7151051625239004,
         "step": 1420
     },
     {
+        "loss": 2.5879,
+        "grad_norm": 1.730402946472168,
         "learning_rate": 0.000805183768854897,
         "epoch": 2.7533460803059273,
         "step": 1440
     },
     {
+        "loss": 2.5413,
+        "grad_norm": 1.803881049156189,
         "learning_rate": 0.0008009347779902273,
         "epoch": 2.791586998087954,
         "step": 1460
     },
     {
+        "loss": 2.4823,
+        "grad_norm": 1.7114533185958862,
         "learning_rate": 0.0007966857871255578,
         "epoch": 2.8298279158699806,
         "step": 1480
     },
     {
+        "loss": 2.4236,
+        "grad_norm": 1.7487016916275024,
         "learning_rate": 0.000792436796260888,
         "epoch": 2.8680688336520075,
         "step": 1500
     },
     {
+        "loss": 2.4762,
+        "grad_norm": 1.7806780338287354,
         "learning_rate": 0.0007881878053962183,
         "epoch": 2.9063097514340344,
         "step": 1520
     },
     {
+        "loss": 2.4221,
+        "grad_norm": 1.851486086845398,
         "learning_rate": 0.0007839388145315488,
         "epoch": 2.9445506692160612,
         "step": 1540
     },
     {
+        "loss": 2.4164,
+        "grad_norm": 1.779451608657837,
         "learning_rate": 0.0007796898236668791,
         "epoch": 2.982791586998088,
         "step": 1560
     },
     {
+        "eval_loss": 1.4862462282180786,
+        "eval_accuracy": 0.6816010763538514,
+        "eval_runtime": 715.5989,
+        "eval_samples_per_second": 20.773,
+        "eval_steps_per_second": 20.773,
         "epoch": 3.0,
         "step": 1569
     },
     {
+        "loss": 2.3348,
+        "grad_norm": 1.7617199420928955,
         "learning_rate": 0.0007754408328022094,
         "epoch": 3.0210325047801145,
         "step": 1580
     },
     {
+        "loss": 2.3196,
+        "grad_norm": 1.7850340604782104,
         "learning_rate": 0.0007711918419375399,
         "epoch": 3.0592734225621414,
         "step": 1600
     },
     {
+        "loss": 2.2991,
+        "grad_norm": 1.828715205192566,
         "learning_rate": 0.0007669428510728702,
         "epoch": 3.0975143403441683,
         "step": 1620
     },
     {
+        "loss": 2.2354,
+        "grad_norm": 1.7825413942337036,
         "learning_rate": 0.0007626938602082005,
         "epoch": 3.135755258126195,
         "step": 1640
     },
     {
+        "loss": 2.221,
+        "grad_norm": 1.8411946296691895,
         "learning_rate": 0.0007584448693435309,
         "epoch": 3.173996175908222,
         "step": 1660
     },
     {
+        "loss": 2.1939,
+        "grad_norm": 1.8236651420593262,
         "learning_rate": 0.0007541958784788613,
         "epoch": 3.2122370936902485,
         "step": 1680
     },
     {
+        "loss": 2.211,
+        "grad_norm": 1.8275988101959229,
         "learning_rate": 0.0007499468876141916,
         "epoch": 3.2504780114722753,
         "step": 1700
     },
     {
+        "loss": 2.1454,
+        "grad_norm": 1.7743233442306519,
         "learning_rate": 0.000745697896749522,
         "epoch": 3.288718929254302,
         "step": 1720
     },
     {
+        "loss": 2.1258,
+        "grad_norm": 1.7873393297195435,
         "learning_rate": 0.0007414489058848524,
         "epoch": 3.3269598470363286,
         "step": 1740
     },
     {
+        "loss": 2.1101,
+        "grad_norm": 1.8012022972106934,
         "learning_rate": 0.0007371999150201827,
         "epoch": 3.3652007648183555,
         "step": 1760
     },
     {
+        "loss": 2.1246,
+        "grad_norm": 1.8000600337982178,
         "learning_rate": 0.0007329509241555131,
         "epoch": 3.4034416826003824,
         "step": 1780
     },
     {
+        "loss": 2.1244,
+        "grad_norm": 1.7723950147628784,
         "learning_rate": 0.0007287019332908435,
         "epoch": 3.4416826003824093,
         "step": 1800
     },
     {
+        "loss": 2.1099,
+        "grad_norm": 1.8095979690551758,
         "learning_rate": 0.0007244529424261738,
         "epoch": 3.479923518164436,
         "step": 1820
     },
     {
+        "loss": 2.0703,
+        "grad_norm": 1.8022161722183228,
         "learning_rate": 0.0007202039515615042,
         "epoch": 3.5181644359464626,
         "step": 1840
     },
     {
+        "loss": 2.0937,
+        "grad_norm": 1.7775332927703857,
         "learning_rate": 0.0007159549606968346,
         "epoch": 3.5564053537284894,
         "step": 1860
     },
     {
+        "loss": 2.0305,
+        "grad_norm": 1.829291820526123,
         "learning_rate": 0.0007117059698321649,
         "epoch": 3.5946462715105163,
         "step": 1880
     },
     {
+        "loss": 2.0528,
+        "grad_norm": 1.731218934059143,
         "learning_rate": 0.0007074569789674953,
         "epoch": 3.632887189292543,
         "step": 1900
     },
     {
+        "loss": 2.0311,
+        "grad_norm": 1.9170475006103516,
         "learning_rate": 0.0007032079881028257,
         "epoch": 3.67112810707457,
         "step": 1920
     },
     {
+        "loss": 2.006,
+        "grad_norm": 1.6934610605239868,
         "learning_rate": 0.0006989589972381559,
         "epoch": 3.7093690248565965,
         "step": 1940
     },
     {
+        "loss": 1.9627,
+        "grad_norm": 1.792523741722107,
         "learning_rate": 0.0006947100063734863,
         "epoch": 3.7476099426386233,
         "step": 1960
     },
     {
+        "loss": 2.0141,
+        "grad_norm": 1.7618036270141602,
         "learning_rate": 0.0006904610155088166,
         "epoch": 3.78585086042065,
         "step": 1980
     },
     {
+        "loss": 1.9489,
+        "grad_norm": 1.7026081085205078,
         "learning_rate": 0.000686212024644147,
         "epoch": 3.8240917782026767,
         "step": 2000
     },
     {
+        "loss": 1.965,
+        "grad_norm": 1.7117011547088623,
         "learning_rate": 0.0006819630337794774,
         "epoch": 3.8623326959847035,
         "step": 2020
     },
     {
+        "loss": 1.9188,
+        "grad_norm": 1.7798806428909302,
         "learning_rate": 0.0006777140429148077,
         "epoch": 3.9005736137667304,
         "step": 2040
     },
     {
+        "loss": 1.9044,
+        "grad_norm": 1.7349345684051514,
         "learning_rate": 0.0006734650520501381,
         "epoch": 3.9388145315487573,
         "step": 2060
     },
     {
+        "loss": 1.8625,
+        "grad_norm": 1.8268795013427734,
         "learning_rate": 0.0006692160611854685,
         "epoch": 3.977055449330784,
         "step": 2080
     },
     {
+        "eval_loss": 0.9794349670410156,
+        "eval_accuracy": 0.7917255297679112,
+        "eval_runtime": 689.4289,
+        "eval_samples_per_second": 21.561,
+        "eval_steps_per_second": 21.561,
         "epoch": 4.0,
         "step": 2092
     },
     {
+        "loss": 1.8294,
+        "grad_norm": 1.6992357969284058,
         "learning_rate": 0.0006649670703207988,
         "epoch": 4.015296367112811,
         "step": 2100
     },
     {
+        "loss": 1.7572,
+        "grad_norm": 1.6715435981750488,
         "learning_rate": 0.0006607180794561292,
         "epoch": 4.053537284894838,
         "step": 2120
     },
     {
+        "loss": 1.7502,
+        "grad_norm": 1.7837462425231934,
         "learning_rate": 0.0006564690885914596,
         "epoch": 4.091778202676864,
         "step": 2140
     },
     {
+        "loss": 1.733,
+        "grad_norm": 1.7979024648666382,
         "learning_rate": 0.0006522200977267899,
         "epoch": 4.130019120458891,
         "step": 2160
     },
     {
+        "loss": 1.7437,
+        "grad_norm": 1.6781351566314697,
         "learning_rate": 0.0006479711068621203,
         "epoch": 4.168260038240918,
         "step": 2180
     },
     {
+        "loss": 1.7702,
+        "grad_norm": 1.7719937562942505,
         "learning_rate": 0.0006437221159974506,
         "epoch": 4.2065009560229445,
         "step": 2200
     },
     {
+        "loss": 1.7546,
+        "grad_norm": 1.7680734395980835,
         "learning_rate": 0.000639473125132781,
         "epoch": 4.244741873804971,
         "step": 2220
     },
     {
+        "loss": 1.6716,
+        "grad_norm": 1.7028470039367676,
         "learning_rate": 0.0006352241342681113,
         "epoch": 4.282982791586998,
         "step": 2240
     },
     {
+        "loss": 1.7316,
+        "grad_norm": 1.764496922492981,
         "learning_rate": 0.0006309751434034417,
         "epoch": 4.321223709369025,
         "step": 2260
     },
     {
+        "loss": 1.6865,
+        "grad_norm": 1.7339156866073608,
         "learning_rate": 0.0006267261525387721,
         "epoch": 4.359464627151052,
         "step": 2280
     },
     {
+        "loss": 1.6711,
+        "grad_norm": 1.6657025814056396,
         "learning_rate": 0.0006224771616741024,
         "epoch": 4.397705544933078,
         "step": 2300
     },
     {
+        "loss": 1.7128,
+        "grad_norm": 1.8127187490463257,
         "learning_rate": 0.0006182281708094328,
         "epoch": 4.435946462715105,
         "step": 2320
     },
     {
+        "loss": 1.6676,
+        "grad_norm": 1.720580816268921,
         "learning_rate": 0.0006139791799447631,
         "epoch": 4.474187380497132,
         "step": 2340
     },
     {
+        "loss": 1.6651,
+        "grad_norm": 1.7044484615325928,
         "learning_rate": 0.0006097301890800934,
         "epoch": 4.512428298279159,
         "step": 2360
     },
     {
+        "loss": 1.6743,
+        "grad_norm": 1.824859857559204,
         "learning_rate": 0.0006054811982154238,
         "epoch": 4.550669216061186,
         "step": 2380
     },
     {
+        "loss": 1.6765,
+        "grad_norm": 1.847652554512024,
         "learning_rate": 0.0006012322073507542,
         "epoch": 4.588910133843212,
         "step": 2400
     },
     {
+        "loss": 1.6257,
+        "grad_norm": 1.6747491359710693,
         "learning_rate": 0.0005969832164860845,
         "epoch": 4.627151051625239,
         "step": 2420
     },
     {
+        "loss": 1.6515,
+        "grad_norm": 1.7143758535385132,
         "learning_rate": 0.0005927342256214149,
         "epoch": 4.665391969407266,
         "step": 2440
     },
     {
+        "loss": 1.6384,
+        "grad_norm": 1.7966911792755127,
         "learning_rate": 0.0005884852347567453,
         "epoch": 4.7036328871892925,
         "step": 2460
     },
     {
+        "loss": 1.5939,
+        "grad_norm": 1.6942733526229858,
         "learning_rate": 0.0005842362438920756,
         "epoch": 4.741873804971319,
         "step": 2480
     },
     {
+        "loss": 1.5908,
+        "grad_norm": 1.7530827522277832,
         "learning_rate": 0.000579987253027406,
         "epoch": 4.780114722753346,
         "step": 2500
     },
     {
+        "loss": 1.5586,
+        "grad_norm": 1.845311164855957,
         "learning_rate": 0.0005757382621627364,
         "epoch": 4.818355640535373,
         "step": 2520
     },
     {
+        "loss": 1.5561,
+        "grad_norm": 1.7648016214370728,
         "learning_rate": 0.0005714892712980667,
         "epoch": 4.8565965583174,
         "step": 2540
     },
     {
+        "loss": 1.5641,
+        "grad_norm": 1.6392900943756104,
         "learning_rate": 0.000567240280433397,
         "epoch": 4.894837476099426,
         "step": 2560
     },
     {
+        "loss": 1.5443,
+        "grad_norm": 1.6668881177902222,
         "learning_rate": 0.0005629912895687275,
         "epoch": 4.933078393881453,
         "step": 2580
     },
     {
+        "loss": 1.5637,
+        "grad_norm": 1.8424748182296753,
         "learning_rate": 0.0005587422987040578,
         "epoch": 4.97131931166348,
         "step": 2600
     },
     {
+        "eval_loss": 0.7047534584999084,
+        "eval_accuracy": 0.8490413723511604,
+        "eval_runtime": 7835.8179,
+        "eval_samples_per_second": 1.897,
+        "eval_steps_per_second": 1.897,
         "epoch": 5.0,
         "step": 2615
     },
     {
+        "loss": 1.4799,
+        "grad_norm": 1.8417068719863892,
         "learning_rate": 0.0005544933078393881,
         "epoch": 5.009560229445507,
         "step": 2620
     },
     {
+        "loss": 1.4184,
+        "grad_norm": 1.6466395854949951,
         "learning_rate": 0.0005502443169747186,
         "epoch": 5.047801147227533,
         "step": 2640
     },
     {
+        "loss": 1.4507,
+        "grad_norm": 1.7499247789382935,
         "learning_rate": 0.0005459953261100489,
         "epoch": 5.08604206500956,
         "step": 2660
     },
     {
+        "loss": 1.405,
+        "grad_norm": 1.7968547344207764,
         "learning_rate": 0.0005417463352453792,
         "epoch": 5.124282982791587,
         "step": 2680
     },
     {
+        "loss": 1.4362,
+        "grad_norm": 1.7950819730758667,
         "learning_rate": 0.0005374973443807097,
         "epoch": 5.162523900573614,
         "step": 2700
     },
     {
+        "loss": 1.4238,
+        "grad_norm": 1.745133399963379,
         "learning_rate": 0.00053324835351604,
         "epoch": 5.2007648183556405,
         "step": 2720
     },
     {
+        "loss": 1.4025,
+        "grad_norm": 1.7767413854599,
         "learning_rate": 0.0005289993626513702,
         "epoch": 5.239005736137667,
         "step": 2740
     },
     {
+        "loss": 1.4144,
+        "grad_norm": 1.7297043800354004,
         "learning_rate": 0.0005247503717867008,
         "epoch": 5.277246653919694,
         "step": 2760
     },
     {
+        "loss": 1.4071,
+        "grad_norm": 1.8174902200698853,
         "learning_rate": 0.000520501380922031,
         "epoch": 5.315487571701721,
         "step": 2780
     },
     {
+        "loss": 1.4022,
+        "grad_norm": 1.6889333724975586,
         "learning_rate": 0.0005162523900573613,
         "epoch": 5.353728489483748,
         "step": 2800
     },
     {
+        "loss": 1.3479,
+        "grad_norm": 1.6331517696380615,
         "learning_rate": 0.0005120033991926918,
         "epoch": 5.3919694072657744,
         "step": 2820
     },
     {
+        "loss": 1.362,
+        "grad_norm": 1.8916860818862915,
         "learning_rate": 0.0005077544083280221,
         "epoch": 5.430210325047801,
         "step": 2840
     },
     {
+        "loss": 1.3613,
+        "grad_norm": 1.706222653388977,
         "learning_rate": 0.0005035054174633524,
         "epoch": 5.468451242829828,
         "step": 2860
     },
     {
+        "loss": 1.3883,
+        "grad_norm": 1.6761025190353394,
         "learning_rate": 0.0004992564265986828,
         "epoch": 5.506692160611855,
         "step": 2880
     },
     {
+        "loss": 1.3432,
+        "grad_norm": 1.632095217704773,
         "learning_rate": 0.0004950074357340132,
         "epoch": 5.544933078393882,
         "step": 2900
     },
     {
+        "loss": 1.3417,
+        "grad_norm": 1.6419159173965454,
         "learning_rate": 0.0004907584448693436,
         "epoch": 5.583173996175908,
         "step": 2920
     },
     {
+        "loss": 1.341,
+        "grad_norm": 1.8355722427368164,
         "learning_rate": 0.0004865094540046739,
         "epoch": 5.621414913957935,
         "step": 2940
     },
     {
+        "loss": 1.339,
+        "grad_norm": 1.6611793041229248,
         "learning_rate": 0.00048226046314000425,
         "epoch": 5.659655831739962,
         "step": 2960
     },
     {
+        "loss": 1.3183,
+        "grad_norm": 1.7696843147277832,
         "learning_rate": 0.0004780114722753346,
         "epoch": 5.6978967495219885,
         "step": 2980
     },
     {
+        "loss": 1.3365,
+        "grad_norm": 1.6689785718917847,
         "learning_rate": 0.000473762481410665,
         "epoch": 5.736137667304015,
         "step": 3000
     },
     {
+        "loss": 1.2966,
+        "grad_norm": 1.6962292194366455,
         "learning_rate": 0.00046951349054599533,
         "epoch": 5.774378585086042,
         "step": 3020
     },
     {
+        "loss": 1.274,
+        "grad_norm": 1.7446441650390625,
         "learning_rate": 0.0004652644996813257,
         "epoch": 5.812619502868069,
         "step": 3040
     },
     {
+        "loss": 1.3145,
+        "grad_norm": 1.7083852291107178,
         "learning_rate": 0.0004610155088166561,
         "epoch": 5.850860420650095,
         "step": 3060
     },
     {
+        "loss": 1.2944,
+        "grad_norm": 1.6674286127090454,
         "learning_rate": 0.0004567665179519864,
         "epoch": 5.8891013384321225,
         "step": 3080
     },
     {
+        "loss": 1.2772,
+        "grad_norm": 1.5787798166275024,
         "learning_rate": 0.00045251752708731676,
         "epoch": 5.927342256214149,
         "step": 3100
     },
     {
+        "loss": 1.265,
+        "grad_norm": 1.6089515686035156,
         "learning_rate": 0.0004482685362226471,
         "epoch": 5.965583173996176,
         "step": 3120
     },
     {
+        "eval_loss": 0.5389042496681213,
+        "eval_accuracy": 0.886242852337706,
+        "eval_runtime": 383.2829,
+        "eval_samples_per_second": 38.783,
+        "eval_steps_per_second": 38.783,
         "epoch": 6.0,
         "step": 3138
     },
     {
+        "loss": 1.2601,
+        "grad_norm": 1.5683554410934448,
         "learning_rate": 0.0004440195453579775,
         "epoch": 6.003824091778203,
         "step": 3140
     },
     {
+        "loss": 1.2067,
+        "grad_norm": 1.6791157722473145,
         "learning_rate": 0.00043977055449330785,
         "epoch": 6.042065009560229,
         "step": 3160
     },
     {
+        "loss": 1.1818,
+        "grad_norm": 1.5930650234222412,
         "learning_rate": 0.0004355215636286382,
         "epoch": 6.080305927342256,
         "step": 3180
     },
     {
+        "loss": 1.1871,
+        "grad_norm": 1.7136551141738892,
         "learning_rate": 0.0004312725727639686,
         "epoch": 6.118546845124283,
         "step": 3200
     },
     {
+        "loss": 1.1851,
+        "grad_norm": 1.6400994062423706,
         "learning_rate": 0.0004270235818992989,
         "epoch": 6.15678776290631,
         "step": 3220
     },
     {
+        "loss": 1.1798,
+        "grad_norm": 1.722548246383667,
         "learning_rate": 0.0004227745910346293,
         "epoch": 6.195028680688337,
         "step": 3240
     },
     {
+        "loss": 1.1516,
+        "grad_norm": 1.600697636604309,
         "learning_rate": 0.0004185256001699597,
         "epoch": 6.233269598470363,
         "step": 3260
     },
     {
+        "loss": 1.176,
+        "grad_norm": 1.6722103357315063,
         "learning_rate": 0.00041427660930528997,
         "epoch": 6.27151051625239,
         "step": 3280
     },
     {
+        "loss": 1.1625,
+        "grad_norm": 1.5297291278839111,
         "learning_rate": 0.00041002761844062037,
         "epoch": 6.309751434034417,
         "step": 3300
     },
     {
+        "loss": 1.1744,
+        "grad_norm": 1.6687546968460083,
         "learning_rate": 0.00040577862757595076,
         "epoch": 6.347992351816444,
         "step": 3320
     },
     {
+        "loss": 1.1408,
+        "grad_norm": 1.6758590936660767,
         "learning_rate": 0.00040152963671128105,
         "epoch": 6.3862332695984705,
         "step": 3340
     },
     {
+        "loss": 1.1458,
+        "grad_norm": 1.7506797313690186,
         "learning_rate": 0.00039728064584661145,
         "epoch": 6.424474187380497,
         "step": 3360
     },
     {
+        "loss": 1.1579,
+        "grad_norm": 1.7690140008926392,
         "learning_rate": 0.0003930316549819418,
         "epoch": 6.462715105162524,
         "step": 3380
     },
     {
+        "loss": 1.1444,
+        "grad_norm": 1.7732901573181152,
         "learning_rate": 0.00038878266411727214,
         "epoch": 6.500956022944551,
         "step": 3400
     },
     {
+        "loss": 1.1431,
+        "grad_norm": 1.7551547288894653,
         "learning_rate": 0.00038453367325260254,
         "epoch": 6.539196940726577,
         "step": 3420
     },
     {
+        "loss": 1.1293,
+        "grad_norm": 1.6275290250778198,
         "learning_rate": 0.0003802846823879329,
         "epoch": 6.577437858508604,
         "step": 3440
     },
     {
+        "loss": 1.1418,
+        "grad_norm": 1.769103765487671,
         "learning_rate": 0.0003760356915232632,
         "epoch": 6.615678776290631,
         "step": 3460
     },
     {
+        "loss": 1.1305,
+        "grad_norm": 1.7487330436706543,
         "learning_rate": 0.0003717867006585936,
         "epoch": 6.653919694072657,
         "step": 3480
     },
     {
+        "loss": 1.0805,
+        "grad_norm": 1.698512315750122,
         "learning_rate": 0.0003675377097939239,
         "epoch": 6.692160611854685,
         "step": 3500
     },
     {
+        "loss": 1.0907,
+        "grad_norm": 1.6636496782302856,
         "learning_rate": 0.0003632887189292543,
         "epoch": 6.730401529636711,
         "step": 3520
     },
     {
+        "loss": 1.1399,
+        "grad_norm": 1.577497959136963,
         "learning_rate": 0.00035903972806458466,
         "epoch": 6.768642447418738,
         "step": 3540
     },
     {
+        "loss": 1.1206,
+        "grad_norm": 1.7101361751556396,
         "learning_rate": 0.000354790737199915,
         "epoch": 6.806883365200765,
         "step": 3560
     },
     {
+        "loss": 1.103,
+        "grad_norm": 1.6473299264907837,
         "learning_rate": 0.0003505417463352454,
         "epoch": 6.845124282982791,
         "step": 3580
     },
     {
+        "loss": 1.1088,
+        "grad_norm": 1.6744282245635986,
         "learning_rate": 0.00034629275547057574,
         "epoch": 6.8833652007648185,
         "step": 3600
     },
     {
+        "loss": 1.0857,
+        "grad_norm": 1.67130708694458,
         "learning_rate": 0.0003420437646059061,
         "epoch": 6.921606118546845,
         "step": 3620
     },
     {
         "loss": 1.0912,
+        "grad_norm": 1.6932523250579834,
         "learning_rate": 0.0003377947737412365,
         "epoch": 6.959847036328872,
         "step": 3640
     },
     {
+        "loss": 1.0888,
+        "grad_norm": 1.6580239534378052,
         "learning_rate": 0.00033354578287656683,
         "epoch": 6.998087954110899,
         "step": 3660
     },
     {
+        "eval_loss": 0.4364229142665863,
+        "eval_accuracy": 0.9101244534140599,
+        "eval_runtime": 605.3795,
+        "eval_samples_per_second": 24.555,
+        "eval_steps_per_second": 24.555,
         "epoch": 7.0,
         "step": 3661
     },
     {
+        "loss": 1.0123,
+        "grad_norm": 1.5949829816818237,
         "learning_rate": 0.0003292967920118972,
         "epoch": 7.036328871892925,
         "step": 3680
     },
     {
+        "loss": 1.0552,
+        "grad_norm": 1.8134639263153076,
         "learning_rate": 0.0003250478011472275,
         "epoch": 7.074569789674952,
         "step": 3700
     },
     {
+        "loss": 1.0142,
+        "grad_norm": 1.6394524574279785,
         "learning_rate": 0.0003207988102825579,
         "epoch": 7.112810707456979,
         "step": 3720
     },
     {
+        "loss": 1.0096,
+        "grad_norm": 1.6918762922286987,
         "learning_rate": 0.00031654981941788826,
         "epoch": 7.151051625239006,
         "step": 3740
     },
     {
+        "loss": 1.0203,
+        "grad_norm": 1.673691987991333,
         "learning_rate": 0.0003123008285532186,
         "epoch": 7.189292543021033,
         "step": 3760
     },
     {
+        "loss": 1.049,
+        "grad_norm": 1.5526095628738403,
         "learning_rate": 0.000308051837688549,
         "epoch": 7.227533460803059,
         "step": 3780
     },
     {
+        "loss": 1.0247,
+        "grad_norm": 1.638197660446167,
         "learning_rate": 0.00030380284682387935,
         "epoch": 7.265774378585086,
         "step": 3800
     },
     {
+        "loss": 0.9841,
+        "grad_norm": 1.6690630912780762,
         "learning_rate": 0.0002995538559592097,
         "epoch": 7.304015296367113,
         "step": 3820
     },
     {
+        "loss": 1.018,
+        "grad_norm": 1.645591139793396,
         "learning_rate": 0.0002953048650945401,
         "epoch": 7.342256214149139,
         "step": 3840
     },
     {
+        "loss": 0.9818,
+        "grad_norm": 1.676079273223877,
         "learning_rate": 0.0002910558742298704,
         "epoch": 7.3804971319311665,
         "step": 3860
     },
     {
+        "loss": 0.9795,
+        "grad_norm": 1.6065680980682373,
         "learning_rate": 0.0002868068833652008,
         "epoch": 7.418738049713193,
         "step": 3880
     },
     {
+        "loss": 0.9588,
+        "grad_norm": 1.683929443359375,
         "learning_rate": 0.0002825578925005312,
         "epoch": 7.45697896749522,
         "step": 3900
     },
     {
+        "loss": 1.0081,
+        "grad_norm": 1.6200690269470215,
         "learning_rate": 0.00027830890163586146,
         "epoch": 7.495219885277247,
         "step": 3920
     },
     {
+        "loss": 0.9822,
+        "grad_norm": 1.7147966623306274,
         "learning_rate": 0.00027405991077119186,
         "epoch": 7.533460803059273,
         "step": 3940
     },
     {
+        "loss": 0.988,
+        "grad_norm": 1.7224268913269043,
         "learning_rate": 0.00026981091990652226,
         "epoch": 7.5717017208413,
         "step": 3960
     },
     {
+        "loss": 0.9562,
+        "grad_norm": 1.7145981788635254,
         "learning_rate": 0.00026556192904185255,
         "epoch": 7.609942638623327,
         "step": 3980
     },
     {
+        "loss": 0.9644,
+        "grad_norm": 1.8020603656768799,
         "learning_rate": 0.00026131293817718295,
         "epoch": 7.648183556405353,
         "step": 4000
     },
     {
+        "loss": 0.9648,
+        "grad_norm": 1.7413355112075806,
         "learning_rate": 0.00025706394731251324,
         "epoch": 7.686424474187381,
         "step": 4020
     },
     {
+        "loss": 0.9244,
+        "grad_norm": 1.6813682317733765,
         "learning_rate": 0.00025281495644784364,
         "epoch": 7.724665391969407,
         "step": 4040
     },
     {
+        "loss": 0.9217,
+        "grad_norm": 1.747910737991333,
         "learning_rate": 0.00024856596558317403,
         "epoch": 7.762906309751434,
         "step": 4060
     },
     {
+        "loss": 0.9628,
+        "grad_norm": 1.6242161989212036,
         "learning_rate": 0.0002443169747185044,
         "epoch": 7.801147227533461,
         "step": 4080
     },
     {
+        "loss": 0.979,
+        "grad_norm": 1.5416340827941895,
         "learning_rate": 0.00024006798385383472,
         "epoch": 7.839388145315487,
         "step": 4100
     },
     {
+        "loss": 0.9528,
+        "grad_norm": 1.7438323497772217,
         "learning_rate": 0.00023581899298916507,
         "epoch": 7.8776290630975145,
         "step": 4120
     },
     {
+        "loss": 0.91,
+        "grad_norm": 1.6303768157958984,
         "learning_rate": 0.00023157000212449544,
         "epoch": 7.915869980879541,
         "step": 4140
     },
     {
+        "loss": 0.9346,
+        "grad_norm": 1.586729884147644,
         "learning_rate": 0.00022732101125982578,
         "epoch": 7.954110898661568,
         "step": 4160
     },
     {
+        "loss": 0.9296,
+        "grad_norm": 1.771173357963562,
         "learning_rate": 0.00022307202039515615,
         "epoch": 7.992351816443595,
         "step": 4180
     },
     {
+        "eval_loss": 0.36169418692588806,
+        "eval_accuracy": 0.9264715775311133,
+        "eval_runtime": 57.1398,
+        "eval_samples_per_second": 260.152,
+        "eval_steps_per_second": 260.152,
         "epoch": 8.0,
         "step": 4184
     },
     {
+        "loss": 0.9005,
+        "grad_norm": 1.5647226572036743,
         "learning_rate": 0.00021882302953048652,
         "epoch": 8.030592734225621,
         "step": 4200
     },
     {
+        "loss": 0.8696,
+        "grad_norm": 1.6461886167526245,
         "learning_rate": 0.00021457403866581687,
         "epoch": 8.068833652007648,
         "step": 4220
     },
     {
+        "loss": 0.8782,
+        "grad_norm": 1.5358296632766724,
         "learning_rate": 0.0002103250478011472,
         "epoch": 8.107074569789676,
         "step": 4240
     },
     {
+        "loss": 0.8483,
+        "grad_norm": 1.6051462888717651,
         "learning_rate": 0.0002060760569364776,
         "epoch": 8.145315487571702,
         "step": 4260
     },
     {
+        "loss": 0.8817,
+        "grad_norm": 1.7184685468673706,
         "learning_rate": 0.00020182706607180795,
         "epoch": 8.183556405353729,
         "step": 4280
     },
     {
+        "loss": 0.8603,
+        "grad_norm": 1.6134257316589355,
         "learning_rate": 0.0001975780752071383,
         "epoch": 8.221797323135755,
         "step": 4300
     },
     {
+        "loss": 0.8654,
+        "grad_norm": 1.5783709287643433,
         "learning_rate": 0.00019332908434246867,
         "epoch": 8.260038240917781,
         "step": 4320
     },
     {
+        "loss": 0.8554,
+        "grad_norm": 1.4778318405151367,
         "learning_rate": 0.00018908009347779904,
         "epoch": 8.29827915869981,
         "step": 4340
     },
     {
+        "loss": 0.8974,
+        "grad_norm": 1.8124628067016602,
         "learning_rate": 0.00018483110261312938,
         "epoch": 8.336520076481836,
         "step": 4360
     },
     {
+        "loss": 0.8716,
+        "grad_norm": 1.7594116926193237,
         "learning_rate": 0.00018058211174845973,
         "epoch": 8.374760994263863,
         "step": 4380
     },
     {
+        "loss": 0.8525,
+        "grad_norm": 1.7039234638214111,
         "learning_rate": 0.0001763331208837901,
         "epoch": 8.413001912045889,
         "step": 4400
     },
     {
+        "loss": 0.8663,
+        "grad_norm": 1.6325209140777588,
         "learning_rate": 0.00017208413001912047,
         "epoch": 8.451242829827915,
         "step": 4420
     },
     {
+        "loss": 0.8629,
+        "grad_norm": 1.6818372011184692,
         "learning_rate": 0.00016783513915445082,
         "epoch": 8.489483747609942,
         "step": 4440
     },
     {
+        "loss": 0.8718,
+        "grad_norm": 1.5809085369110107,
         "learning_rate": 0.00016358614828978119,
         "epoch": 8.52772466539197,
         "step": 4460
     },
     {
+        "loss": 0.8527,
+        "grad_norm": 1.5711621046066284,
         "learning_rate": 0.00015933715742511153,
         "epoch": 8.565965583173996,
         "step": 4480
     },
     {
+        "loss": 0.8589,
+        "grad_norm": 1.5462543964385986,
         "learning_rate": 0.0001550881665604419,
         "epoch": 8.604206500956023,
         "step": 4500
     },
     {
+        "loss": 0.8649,
+        "grad_norm": 1.6341811418533325,
         "learning_rate": 0.00015083917569577227,
         "epoch": 8.64244741873805,
         "step": 4520
     },
     {
+        "loss": 0.8192,
+        "grad_norm": 1.5172038078308105,
         "learning_rate": 0.00014659018483110262,
         "epoch": 8.680688336520076,
         "step": 4540
     },
     {
+        "loss": 0.8436,
+        "grad_norm": 1.4799879789352417,
         "learning_rate": 0.00014234119396643296,
         "epoch": 8.718929254302104,
         "step": 4560
     },
     {
+        "loss": 0.8292,
+        "grad_norm": 1.547850251197815,
         "learning_rate": 0.00013809220310176336,
         "epoch": 8.75717017208413,
         "step": 4580
     },
     {
+        "loss": 0.8527,
+        "grad_norm": 1.8095018863677979,
         "learning_rate": 0.0001338432122370937,
         "epoch": 8.795411089866157,
         "step": 4600
     },
     {
+        "loss": 0.8358,
+        "grad_norm": 1.5578687191009521,
         "learning_rate": 0.00012959422137242405,
         "epoch": 8.833652007648183,
         "step": 4620
     },
     {
+        "loss": 0.822,
+        "grad_norm": 1.7008335590362549,
         "learning_rate": 0.0001253452305077544,
         "epoch": 8.87189292543021,
         "step": 4640
     },
     {
+        "loss": 0.8352,
+        "grad_norm": 1.6717548370361328,
         "learning_rate": 0.00012109623964308478,
         "epoch": 8.910133843212238,
         "step": 4660
     },
     {
+        "loss": 0.826,
+        "grad_norm": 1.729179859161377,
         "learning_rate": 0.00011684724877841513,
         "epoch": 8.948374760994264,
         "step": 4680
     },
     {
+        "loss": 0.8066,
+        "grad_norm": 1.6358789205551147,
         "learning_rate": 0.00011259825791374549,
         "epoch": 8.98661567877629,
         "step": 4700
     },
     {
+        "eval_loss": 0.3206591010093689,
         "eval_accuracy": 0.9352842246888665,
+        "eval_runtime": 381.3245,
+        "eval_samples_per_second": 38.983,
+        "eval_steps_per_second": 38.983,
         "epoch": 9.0,
         "step": 4707
     },
     {
+        "loss": 0.8019,
+        "grad_norm": 1.5783698558807373,
         "learning_rate": 0.00010834926704907585,
         "epoch": 9.024856596558317,
         "step": 4720
     },
     {
+        "loss": 0.7702,
+        "grad_norm": 1.5668120384216309,
         "learning_rate": 0.0001041002761844062,
         "epoch": 9.063097514340344,
         "step": 4740
     },
     {
+        "loss": 0.7934,
+        "grad_norm": 1.624084234237671,
         "learning_rate": 9.985128531973658e-05,
         "epoch": 9.101338432122372,
         "step": 4760
     },
     {
+        "loss": 0.7863,
+        "grad_norm": 1.5410951375961304,
         "learning_rate": 9.560229445506692e-05,
         "epoch": 9.139579349904398,
         "step": 4780
     },
     {
+        "loss": 0.7894,
+        "grad_norm": 1.663845419883728,
         "learning_rate": 9.135330359039729e-05,
         "epoch": 9.177820267686425,
         "step": 4800
     },
     {
+        "loss": 0.739,
+        "grad_norm": 1.5939579010009766,
         "learning_rate": 8.710431272572764e-05,
         "epoch": 9.216061185468451,
         "step": 4820
     },
     {
+        "loss": 0.7808,
+        "grad_norm": 1.5545909404754639,
         "learning_rate": 8.2855321861058e-05,
         "epoch": 9.254302103250478,
         "step": 4840
     },
     {
+        "loss": 0.7761,
+        "grad_norm": 1.665999412536621,
         "learning_rate": 7.860633099638836e-05,
         "epoch": 9.292543021032504,
         "step": 4860
     },
     {
+        "loss": 0.7876,
+        "grad_norm": 1.6480567455291748,
         "learning_rate": 7.435734013171871e-05,
         "epoch": 9.330783938814532,
         "step": 4880
     },
     {
+        "loss": 0.767,
+        "grad_norm": 1.5779589414596558,
         "learning_rate": 7.010834926704908e-05,
         "epoch": 9.369024856596559,
         "step": 4900
     },
     {
+        "loss": 0.783,
+        "grad_norm": 1.6985348463058472,
         "learning_rate": 6.585935840237942e-05,
         "epoch": 9.407265774378585,
         "step": 4920
     },
     {
+        "loss": 0.756,
+        "grad_norm": 1.5563093423843384,
         "learning_rate": 6.16103675377098e-05,
         "epoch": 9.445506692160611,
         "step": 4940
     },
     {
+        "loss": 0.7682,
+        "grad_norm": 1.6173079013824463,
         "learning_rate": 5.736137667304015e-05,
         "epoch": 9.483747609942638,
         "step": 4960
     },
     {
+        "loss": 0.7632,
+        "grad_norm": 1.5880271196365356,
         "learning_rate": 5.311238580837052e-05,
         "epoch": 9.521988527724666,
         "step": 4980
     },
     {
+        "loss": 0.7602,
+        "grad_norm": 1.6329987049102783,
         "learning_rate": 4.8863394943700874e-05,
         "epoch": 9.560229445506693,
         "step": 5000
     },
     {
+        "loss": 0.7595,
+        "grad_norm": 1.442744255065918,
         "learning_rate": 4.461440407903123e-05,
         "epoch": 9.598470363288719,
         "step": 5020
     },
     {
+        "loss": 0.7621,
+        "grad_norm": 1.5359572172164917,
         "learning_rate": 4.036541321436159e-05,
         "epoch": 9.636711281070745,
         "step": 5040
     },
     {
+        "loss": 0.7717,
+        "grad_norm": 1.6465296745300293,
         "learning_rate": 3.6116422349691954e-05,
         "epoch": 9.674952198852772,
         "step": 5060
     },
     {
+        "loss": 0.73,
+        "grad_norm": 1.6590745449066162,
         "learning_rate": 3.186743148502231e-05,
         "epoch": 9.7131931166348,
         "step": 5080
     },
     {
+        "loss": 0.7486,
+        "grad_norm": 1.5176348686218262,
         "learning_rate": 2.7618440620352666e-05,
         "epoch": 9.751434034416826,
         "step": 5100
     },
     {
+        "loss": 0.7766,
+        "grad_norm": 1.7158029079437256,
         "learning_rate": 2.3369449755683023e-05,
         "epoch": 9.789674952198853,
         "step": 5120
     },
     {
+        "loss": 0.7159,
+        "grad_norm": 1.6565515995025635,
         "learning_rate": 1.9120458891013384e-05,
         "epoch": 9.82791586998088,
         "step": 5140
     },
     {
+        "loss": 0.7304,
+        "grad_norm": 1.4815343618392944,
         "learning_rate": 1.4871468026343743e-05,
         "epoch": 9.866156787762906,
         "step": 5160
     },
     {
+        "loss": 0.744,
+        "grad_norm": 1.6041672229766846,
         "learning_rate": 1.0622477161674103e-05,
         "epoch": 9.904397705544934,
         "step": 5180
     },
     {
+        "loss": 0.7562,
+        "grad_norm": 1.6224092245101929,
         "learning_rate": 6.373486297004461e-06,
         "epoch": 9.94263862332696,
         "step": 5200
     },
     {
+        "loss": 0.7675,
+        "grad_norm": 1.5392311811447144,
         "learning_rate": 2.1244954323348204e-06,
         "epoch": 9.980879541108987,
         "step": 5220
     },
     {
+        "eval_loss": 0.29811325669288635,
+        "eval_accuracy": 0.9405314497140935,
+        "eval_runtime": 464.2204,
+        "eval_samples_per_second": 32.021,
+        "eval_steps_per_second": 32.021,
         "epoch": 10.0,
         "step": 5230
     },
     {
+        "train_runtime": 50832.5393,
+        "train_samples_per_second": 26.317,
+        "train_steps_per_second": 0.103,
         "total_flos": 1.96318398191328e+18,
+        "train_loss": 2.122584131525306,
         "epoch": 10.0,
         "step": 5230
     }

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 10.0,
     "total_flos": 1.96318398191328e+18,
-    "train_loss": 2.1099621225725396,
-    "train_runtime": 19361.582,
-    "train_samples_per_second": 69.094,
-    "train_steps_per_second": 0.27
 }

 {
     "epoch": 10.0,
     "total_flos": 1.96318398191328e+18,
+    "train_loss": 2.122584131525306,
+    "train_runtime": 50832.5393,
+    "train_samples_per_second": 26.317,
+    "train_steps_per_second": 0.103
 }

trainer_state.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "best_metric": 0.9410023545240498,
   "best_model_checkpoint": "/mnt/data4_HDD_14TB/yang/voxceleb-checkpoints/xvector/voxceleb1/finetune/ce-len3-bs256-lr1e-3/checkpoint-5230",
   "epoch": 10.0,
   "eval_steps": 500,
@@ -10,1929 +10,1929 @@
   "log_history": [
     {
       "epoch": 0.03824091778202677,
-      "grad_norm": 4.7170538902282715,
       "learning_rate": 3.824091778202677e-05,
-      "loss": 7.1456,
       "step": 20
     },
     {
       "epoch": 0.07648183556405354,
-      "grad_norm": 4.5134358406066895,
       "learning_rate": 7.648183556405354e-05,
-      "loss": 7.1158,
       "step": 40
     },
     {
       "epoch": 0.1147227533460803,
-      "grad_norm": 3.9098806381225586,
       "learning_rate": 0.0001147227533460803,
-      "loss": 7.0575,
       "step": 60
     },
     {
       "epoch": 0.15296367112810708,
-      "grad_norm": 3.2200050354003906,
       "learning_rate": 0.00015296367112810707,
-      "loss": 6.9743,
       "step": 80
     },
     {
       "epoch": 0.19120458891013384,
-      "grad_norm": 2.5479934215545654,
       "learning_rate": 0.00019120458891013384,
-      "loss": 6.874,
       "step": 100
     },
     {
       "epoch": 0.2294455066921606,
-      "grad_norm": 2.168301820755005,
       "learning_rate": 0.0002294455066921606,
-      "loss": 6.7278,
       "step": 120
     },
     {
       "epoch": 0.2676864244741874,
-      "grad_norm": 2.0083394050598145,
       "learning_rate": 0.0002676864244741874,
-      "loss": 6.5778,
       "step": 140
     },
     {
       "epoch": 0.30592734225621415,
-      "grad_norm": 1.8298897743225098,
       "learning_rate": 0.00030592734225621415,
-      "loss": 6.4192,
       "step": 160
     },
     {
       "epoch": 0.3441682600382409,
-      "grad_norm": 1.782423734664917,
       "learning_rate": 0.00034416826003824094,
-      "loss": 6.2416,
       "step": 180
     },
     {
       "epoch": 0.3824091778202677,
-      "grad_norm": 1.8139146566390991,
       "learning_rate": 0.0003824091778202677,
-      "loss": 6.0624,
       "step": 200
     },
     {
       "epoch": 0.42065009560229444,
-      "grad_norm": 1.7659958600997925,
       "learning_rate": 0.0004206500956022944,
-      "loss": 5.9686,
       "step": 220
     },
     {
       "epoch": 0.4588910133843212,
-      "grad_norm": 1.8660094738006592,
       "learning_rate": 0.0004588910133843212,
-      "loss": 5.8142,
       "step": 240
     },
     {
       "epoch": 0.497131931166348,
-      "grad_norm": 1.831566333770752,
       "learning_rate": 0.0004971319311663481,
-      "loss": 5.6944,
       "step": 260
     },
     {
       "epoch": 0.5353728489483748,
-      "grad_norm": 1.7546241283416748,
       "learning_rate": 0.0005353728489483748,
-      "loss": 5.6101,
       "step": 280
     },
     {
       "epoch": 0.5736137667304015,
-      "grad_norm": 1.8890600204467773,
       "learning_rate": 0.0005736137667304016,
-      "loss": 5.5192,
       "step": 300
     },
     {
       "epoch": 0.6118546845124283,
-      "grad_norm": 1.7542874813079834,
       "learning_rate": 0.0006118546845124283,
-      "loss": 5.3822,
       "step": 320
     },
     {
       "epoch": 0.6500956022944551,
-      "grad_norm": 1.8762731552124023,
       "learning_rate": 0.000650095602294455,
-      "loss": 5.3236,
       "step": 340
     },
     {
       "epoch": 0.6883365200764818,
-      "grad_norm": 1.886903166770935,
       "learning_rate": 0.0006883365200764819,
-      "loss": 5.2483,
       "step": 360
     },
     {
       "epoch": 0.7265774378585086,
-      "grad_norm": 1.9873583316802979,
       "learning_rate": 0.0007265774378585086,
-      "loss": 5.1245,
       "step": 380
     },
     {
       "epoch": 0.7648183556405354,
-      "grad_norm": 1.953506350517273,
       "learning_rate": 0.0007648183556405354,
-      "loss": 5.0771,
       "step": 400
     },
     {
       "epoch": 0.8030592734225621,
-      "grad_norm": 1.851192831993103,
       "learning_rate": 0.0008030592734225621,
-      "loss": 5.0354,
       "step": 420
     },
     {
       "epoch": 0.8413001912045889,
-      "grad_norm": 1.861971139907837,
       "learning_rate": 0.0008413001912045888,
-      "loss": 4.9532,
       "step": 440
     },
     {
       "epoch": 0.8795411089866156,
-      "grad_norm": 1.9388970136642456,
       "learning_rate": 0.0008795411089866157,
-      "loss": 4.8698,
       "step": 460
     },
     {
       "epoch": 0.9177820267686424,
-      "grad_norm": 1.919184684753418,
       "learning_rate": 0.0009177820267686424,
-      "loss": 4.847,
       "step": 480
     },
     {
       "epoch": 0.9560229445506692,
-      "grad_norm": 1.8886795043945312,
       "learning_rate": 0.0009560229445506692,
-      "loss": 4.7754,
       "step": 500
     },
     {
       "epoch": 0.994263862332696,
-      "grad_norm": 1.9832813739776611,
       "learning_rate": 0.0009942638623326961,
-      "loss": 4.6728,
       "step": 520
     },
     {
       "epoch": 1.0,
-      "eval_accuracy": 0.15042045072317525,
-      "eval_loss": 4.345639228820801,
-      "eval_runtime": 211.2136,
-      "eval_samples_per_second": 70.379,
-      "eval_steps_per_second": 70.379,
       "step": 523
     },
     {
       "epoch": 1.0325047801147227,
-      "grad_norm": 1.831260323524475,
       "learning_rate": 0.0009963883577650309,
-      "loss": 4.5657,
       "step": 540
     },
     {
       "epoch": 1.0707456978967496,
-      "grad_norm": 1.934414029121399,
       "learning_rate": 0.0009921393669003612,
-      "loss": 4.5047,
       "step": 560
     },
     {
       "epoch": 1.1089866156787762,
-      "grad_norm": 1.7718721628189087,
       "learning_rate": 0.0009878903760356915,
-      "loss": 4.4165,
       "step": 580
     },
     {
       "epoch": 1.147227533460803,
-      "grad_norm": 1.741455078125,
       "learning_rate": 0.0009836413851710218,
-      "loss": 4.3933,
       "step": 600
     },
     {
       "epoch": 1.1854684512428297,
-      "grad_norm": 1.8857481479644775,
       "learning_rate": 0.0009793923943063523,
-      "loss": 4.3249,
       "step": 620
     },
     {
       "epoch": 1.2237093690248566,
-      "grad_norm": 1.8325748443603516,
       "learning_rate": 0.0009751434034416827,
-      "loss": 4.2147,
       "step": 640
     },
     {
       "epoch": 1.2619502868068833,
-      "grad_norm": 1.8758591413497925,
       "learning_rate": 0.000970894412577013,
-      "loss": 4.1569,
       "step": 660
     },
     {
       "epoch": 1.3001912045889101,
-      "grad_norm": 1.899542212486267,
       "learning_rate": 0.0009666454217123433,
-      "loss": 4.131,
       "step": 680
     },
     {
       "epoch": 1.338432122370937,
-      "grad_norm": 1.8188538551330566,
       "learning_rate": 0.0009623964308476737,
-      "loss": 4.0467,
       "step": 700
     },
     {
       "epoch": 1.3766730401529637,
-      "grad_norm": 1.7679705619812012,
       "learning_rate": 0.000958147439983004,
-      "loss": 3.9904,
       "step": 720
     },
     {
       "epoch": 1.4149139579349903,
-      "grad_norm": 1.849482774734497,
       "learning_rate": 0.0009538984491183344,
-      "loss": 3.9464,
       "step": 740
     },
     {
       "epoch": 1.4531548757170172,
-      "grad_norm": 1.8237632513046265,
       "learning_rate": 0.0009496494582536647,
-      "loss": 3.9104,
       "step": 760
     },
     {
       "epoch": 1.491395793499044,
-      "grad_norm": 1.8175936937332153,
       "learning_rate": 0.0009454004673889951,
-      "loss": 3.8441,
       "step": 780
     },
     {
       "epoch": 1.5296367112810707,
-      "grad_norm": 1.7967997789382935,
       "learning_rate": 0.0009411514765243255,
-      "loss": 3.7898,
       "step": 800
     },
     {
       "epoch": 1.5678776290630974,
-      "grad_norm": 1.7681634426116943,
       "learning_rate": 0.0009369024856596558,
-      "loss": 3.6894,
       "step": 820
     },
     {
       "epoch": 1.6061185468451242,
-      "grad_norm": 1.8655925989151,
       "learning_rate": 0.0009326534947949862,
-      "loss": 3.6798,
       "step": 840
     },
     {
       "epoch": 1.644359464627151,
-      "grad_norm": 1.853769302368164,
       "learning_rate": 0.0009284045039303166,
-      "loss": 3.6297,
       "step": 860
     },
     {
       "epoch": 1.682600382409178,
-      "grad_norm": 1.8198288679122925,
       "learning_rate": 0.0009241555130656469,
-      "loss": 3.5592,
       "step": 880
     },
     {
       "epoch": 1.7208413001912046,
-      "grad_norm": 1.7744460105895996,
       "learning_rate": 0.0009199065222009773,
-      "loss": 3.5056,
       "step": 900
     },
     {
       "epoch": 1.7590822179732313,
-      "grad_norm": 1.797914981842041,
       "learning_rate": 0.0009156575313363077,
-      "loss": 3.4635,
       "step": 920
     },
     {
       "epoch": 1.7973231357552581,
-      "grad_norm": 1.8479169607162476,
       "learning_rate": 0.000911408540471638,
-      "loss": 3.4434,
       "step": 940
     },
     {
       "epoch": 1.835564053537285,
-      "grad_norm": 1.818405032157898,
       "learning_rate": 0.0009071595496069684,
-      "loss": 3.441,
       "step": 960
     },
     {
       "epoch": 1.8738049713193117,
-      "grad_norm": 1.7609572410583496,
       "learning_rate": 0.0009029105587422988,
-      "loss": 3.3934,
       "step": 980
     },
     {
       "epoch": 1.9120458891013383,
-      "grad_norm": 1.7228211164474487,
       "learning_rate": 0.0008986615678776291,
-      "loss": 3.2961,
       "step": 1000
     },
     {
       "epoch": 1.9502868068833652,
-      "grad_norm": 1.8148291110992432,
       "learning_rate": 0.0008944125770129595,
-      "loss": 3.2611,
       "step": 1020
     },
     {
       "epoch": 1.988527724665392,
-      "grad_norm": 1.933300495147705,
       "learning_rate": 0.0008901635861482899,
-      "loss": 3.224,
       "step": 1040
     },
     {
       "epoch": 2.0,
-      "eval_accuracy": 0.5140935082408342,
-      "eval_loss": 2.258894205093384,
-      "eval_runtime": 203.9361,
-      "eval_samples_per_second": 72.89,
-      "eval_steps_per_second": 72.89,
       "step": 1046
     },
     {
       "epoch": 2.026768642447419,
-      "grad_norm": 1.819346308708191,
       "learning_rate": 0.0008859145952836202,
-      "loss": 3.1667,
       "step": 1060
     },
     {
       "epoch": 2.0650095602294454,
-      "grad_norm": 1.7024896144866943,
       "learning_rate": 0.0008816656044189504,
-      "loss": 3.0232,
       "step": 1080
     },
     {
       "epoch": 2.1032504780114722,
-      "grad_norm": 1.7023948431015015,
       "learning_rate": 0.000877416613554281,
-      "loss": 3.0489,
       "step": 1100
     },
     {
       "epoch": 2.141491395793499,
-      "grad_norm": 1.804140329360962,
       "learning_rate": 0.0008731676226896112,
-      "loss": 2.9732,
       "step": 1120
     },
     {
       "epoch": 2.179732313575526,
-      "grad_norm": 1.7260992527008057,
       "learning_rate": 0.0008689186318249415,
-      "loss": 2.9562,
       "step": 1140
     },
     {
       "epoch": 2.2179732313575524,
-      "grad_norm": 1.7970356941223145,
       "learning_rate": 0.000864669640960272,
-      "loss": 2.8875,
       "step": 1160
     },
     {
       "epoch": 2.2562141491395793,
-      "grad_norm": 1.8579261302947998,
       "learning_rate": 0.0008604206500956023,
-      "loss": 2.916,
       "step": 1180
     },
     {
       "epoch": 2.294455066921606,
-      "grad_norm": 1.852342128753662,
       "learning_rate": 0.0008561716592309326,
-      "loss": 2.8963,
       "step": 1200
     },
     {
       "epoch": 2.332695984703633,
-      "grad_norm": 1.8845752477645874,
       "learning_rate": 0.000851922668366263,
-      "loss": 2.8029,
       "step": 1220
     },
     {
       "epoch": 2.3709369024856595,
-      "grad_norm": 1.883952260017395,
       "learning_rate": 0.0008476736775015934,
-      "loss": 2.8237,
       "step": 1240
     },
     {
       "epoch": 2.4091778202676863,
-      "grad_norm": 1.8383756875991821,
       "learning_rate": 0.0008434246866369237,
-      "loss": 2.8473,
       "step": 1260
     },
     {
       "epoch": 2.447418738049713,
-      "grad_norm": 1.8900470733642578,
       "learning_rate": 0.0008391756957722541,
-      "loss": 2.7722,
       "step": 1280
     },
     {
       "epoch": 2.48565965583174,
-      "grad_norm": 1.8097845315933228,
       "learning_rate": 0.0008349267049075845,
-      "loss": 2.7584,
       "step": 1300
     },
     {
       "epoch": 2.5239005736137665,
-      "grad_norm": 1.7215895652770996,
       "learning_rate": 0.0008306777140429148,
-      "loss": 2.7134,
       "step": 1320
     },
     {
       "epoch": 2.5621414913957934,
-      "grad_norm": 1.8249051570892334,
       "learning_rate": 0.0008264287231782451,
-      "loss": 2.6531,
       "step": 1340
     },
     {
       "epoch": 2.6003824091778203,
-      "grad_norm": 1.8082237243652344,
       "learning_rate": 0.0008221797323135756,
-      "loss": 2.6675,
       "step": 1360
     },
     {
       "epoch": 2.638623326959847,
-      "grad_norm": 1.7981261014938354,
       "learning_rate": 0.0008179307414489059,
-      "loss": 2.5702,
       "step": 1380
     },
     {
       "epoch": 2.676864244741874,
-      "grad_norm": 1.6964036226272583,
       "learning_rate": 0.0008136817505842362,
-      "loss": 2.6339,
       "step": 1400
     },
     {
       "epoch": 2.7151051625239004,
-      "grad_norm": 1.755050778388977,
       "learning_rate": 0.0008094327597195667,
-      "loss": 2.5489,
       "step": 1420
     },
     {
       "epoch": 2.7533460803059273,
-      "grad_norm": 1.7242581844329834,
       "learning_rate": 0.000805183768854897,
-      "loss": 2.5908,
       "step": 1440
     },
     {
       "epoch": 2.791586998087954,
-      "grad_norm": 1.819612741470337,
       "learning_rate": 0.0008009347779902273,
-      "loss": 2.5143,
       "step": 1460
     },
     {
       "epoch": 2.8298279158699806,
-      "grad_norm": 1.7033363580703735,
       "learning_rate": 0.0007966857871255578,
-      "loss": 2.4662,
       "step": 1480
     },
     {
       "epoch": 2.8680688336520075,
-      "grad_norm": 1.7662159204483032,
       "learning_rate": 0.000792436796260888,
-      "loss": 2.4044,
       "step": 1500
     },
     {
       "epoch": 2.9063097514340344,
-      "grad_norm": 1.7460269927978516,
       "learning_rate": 0.0007881878053962183,
-      "loss": 2.4636,
       "step": 1520
     },
     {
       "epoch": 2.9445506692160612,
-      "grad_norm": 1.8268380165100098,
       "learning_rate": 0.0007839388145315488,
-      "loss": 2.3955,
       "step": 1540
     },
     {
       "epoch": 2.982791586998088,
-      "grad_norm": 1.796981930732727,
       "learning_rate": 0.0007796898236668791,
-      "loss": 2.3964,
       "step": 1560
     },
     {
       "epoch": 3.0,
-      "eval_accuracy": 0.6835519677093844,
-      "eval_loss": 1.4662528038024902,
-      "eval_runtime": 419.2044,
-      "eval_samples_per_second": 35.46,
-      "eval_steps_per_second": 35.46,
       "step": 1569
     },
     {
       "epoch": 3.0210325047801145,
-      "grad_norm": 1.7852272987365723,
       "learning_rate": 0.0007754408328022094,
-      "loss": 2.3174,
       "step": 1580
     },
     {
       "epoch": 3.0592734225621414,
-      "grad_norm": 1.8464534282684326,
       "learning_rate": 0.0007711918419375399,
-      "loss": 2.2913,
       "step": 1600
     },
     {
       "epoch": 3.0975143403441683,
-      "grad_norm": 1.7783145904541016,
       "learning_rate": 0.0007669428510728702,
-      "loss": 2.2856,
       "step": 1620
     },
     {
       "epoch": 3.135755258126195,
-      "grad_norm": 1.744454264640808,
       "learning_rate": 0.0007626938602082005,
-      "loss": 2.2099,
       "step": 1640
     },
     {
       "epoch": 3.173996175908222,
-      "grad_norm": 1.8276797533035278,
       "learning_rate": 0.0007584448693435309,
-      "loss": 2.23,
       "step": 1660
     },
     {
       "epoch": 3.2122370936902485,
-      "grad_norm": 1.8144315481185913,
       "learning_rate": 0.0007541958784788613,
-      "loss": 2.1912,
       "step": 1680
     },
     {
       "epoch": 3.2504780114722753,
-      "grad_norm": 1.8499830961227417,
       "learning_rate": 0.0007499468876141916,
-      "loss": 2.1818,
       "step": 1700
     },
     {
       "epoch": 3.288718929254302,
-      "grad_norm": 1.7623099088668823,
       "learning_rate": 0.000745697896749522,
-      "loss": 2.1349,
       "step": 1720
     },
     {
       "epoch": 3.3269598470363286,
-      "grad_norm": 1.8180640935897827,
       "learning_rate": 0.0007414489058848524,
-      "loss": 2.1055,
       "step": 1740
     },
     {
       "epoch": 3.3652007648183555,
-      "grad_norm": 1.8159964084625244,
       "learning_rate": 0.0007371999150201827,
-      "loss": 2.1077,
       "step": 1760
     },
     {
       "epoch": 3.4034416826003824,
-      "grad_norm": 1.7902129888534546,
       "learning_rate": 0.0007329509241555131,
-      "loss": 2.0999,
       "step": 1780
     },
     {
       "epoch": 3.4416826003824093,
-      "grad_norm": 1.7685898542404175,
       "learning_rate": 0.0007287019332908435,
-      "loss": 2.1188,
       "step": 1800
     },
     {
       "epoch": 3.479923518164436,
-      "grad_norm": 1.758325219154358,
       "learning_rate": 0.0007244529424261738,
-      "loss": 2.0956,
       "step": 1820
     },
     {
       "epoch": 3.5181644359464626,
-      "grad_norm": 1.7802537679672241,
       "learning_rate": 0.0007202039515615042,
-      "loss": 2.0488,
       "step": 1840
     },
     {
       "epoch": 3.5564053537284894,
-      "grad_norm": 1.8220280408859253,
       "learning_rate": 0.0007159549606968346,
-      "loss": 2.0776,
       "step": 1860
     },
     {
       "epoch": 3.5946462715105163,
-      "grad_norm": 1.8494378328323364,
       "learning_rate": 0.0007117059698321649,
-      "loss": 2.0257,
       "step": 1880
     },
     {
       "epoch": 3.632887189292543,
-      "grad_norm": 1.719109296798706,
       "learning_rate": 0.0007074569789674953,
-      "loss": 2.0332,
       "step": 1900
     },
     {
       "epoch": 3.67112810707457,
-      "grad_norm": 1.9517509937286377,
       "learning_rate": 0.0007032079881028257,
-      "loss": 2.0052,
       "step": 1920
     },
     {
       "epoch": 3.7093690248565965,
-      "grad_norm": 1.7318940162658691,
       "learning_rate": 0.0006989589972381559,
-      "loss": 1.9901,
       "step": 1940
     },
     {
       "epoch": 3.7476099426386233,
-      "grad_norm": 1.767015814781189,
       "learning_rate": 0.0006947100063734863,
-      "loss": 1.9411,
       "step": 1960
     },
     {
       "epoch": 3.78585086042065,
-      "grad_norm": 1.761806607246399,
       "learning_rate": 0.0006904610155088166,
-      "loss": 2.0048,
       "step": 1980
     },
     {
       "epoch": 3.8240917782026767,
-      "grad_norm": 1.7126002311706543,
       "learning_rate": 0.000686212024644147,
-      "loss": 1.9312,
       "step": 2000
     },
     {
       "epoch": 3.8623326959847035,
-      "grad_norm": 1.7167437076568604,
       "learning_rate": 0.0006819630337794774,
-      "loss": 1.9443,
       "step": 2020
     },
     {
       "epoch": 3.9005736137667304,
-      "grad_norm": 1.749881386756897,
       "learning_rate": 0.0006777140429148077,
-      "loss": 1.893,
       "step": 2040
     },
     {
       "epoch": 3.9388145315487573,
-      "grad_norm": 1.6846592426300049,
       "learning_rate": 0.0006734650520501381,
-      "loss": 1.8876,
       "step": 2060
     },
     {
       "epoch": 3.977055449330784,
-      "grad_norm": 1.8149057626724243,
       "learning_rate": 0.0006692160611854685,
-      "loss": 1.8474,
       "step": 2080
     },
     {
       "epoch": 4.0,
-      "eval_accuracy": 0.7926673393878237,
-      "eval_loss": 0.9547563195228577,
-      "eval_runtime": 183.3001,
-      "eval_samples_per_second": 81.097,
-      "eval_steps_per_second": 81.097,
       "step": 2092
     },
     {
       "epoch": 4.015296367112811,
-      "grad_norm": 1.6455098390579224,
       "learning_rate": 0.0006649670703207988,
-      "loss": 1.813,
       "step": 2100
     },
     {
       "epoch": 4.053537284894838,
-      "grad_norm": 1.6958200931549072,
       "learning_rate": 0.0006607180794561292,
-      "loss": 1.7354,
       "step": 2120
     },
     {
       "epoch": 4.091778202676864,
-      "grad_norm": 1.7456037998199463,
       "learning_rate": 0.0006564690885914596,
-      "loss": 1.7479,
       "step": 2140
     },
     {
       "epoch": 4.130019120458891,
-      "grad_norm": 1.7887734174728394,
       "learning_rate": 0.0006522200977267899,
-      "loss": 1.7138,
       "step": 2160
     },
     {
       "epoch": 4.168260038240918,
-      "grad_norm": 1.7080284357070923,
       "learning_rate": 0.0006479711068621203,
-      "loss": 1.7023,
       "step": 2180
     },
     {
       "epoch": 4.2065009560229445,
-      "grad_norm": 1.8061983585357666,
       "learning_rate": 0.0006437221159974506,
-      "loss": 1.7526,
       "step": 2200
     },
     {
       "epoch": 4.244741873804971,
-      "grad_norm": 1.7831811904907227,
       "learning_rate": 0.000639473125132781,
-      "loss": 1.7474,
       "step": 2220
     },
     {
       "epoch": 4.282982791586998,
-      "grad_norm": 1.752357840538025,
       "learning_rate": 0.0006352241342681113,
-      "loss": 1.6688,
       "step": 2240
     },
     {
       "epoch": 4.321223709369025,
-      "grad_norm": 1.7843034267425537,
       "learning_rate": 0.0006309751434034417,
-      "loss": 1.7009,
       "step": 2260
     },
     {
       "epoch": 4.359464627151052,
-      "grad_norm": 1.7608367204666138,
       "learning_rate": 0.0006267261525387721,
-      "loss": 1.6727,
       "step": 2280
     },
     {
       "epoch": 4.397705544933078,
-      "grad_norm": 1.6877254247665405,
       "learning_rate": 0.0006224771616741024,
-      "loss": 1.6801,
       "step": 2300
     },
     {
       "epoch": 4.435946462715105,
-      "grad_norm": 1.7891350984573364,
       "learning_rate": 0.0006182281708094328,
-      "loss": 1.7108,
       "step": 2320
     },
     {
       "epoch": 4.474187380497132,
-      "grad_norm": 1.7104123830795288,
       "learning_rate": 0.0006139791799447631,
-      "loss": 1.6442,
       "step": 2340
     },
     {
       "epoch": 4.512428298279159,
-      "grad_norm": 1.7026969194412231,
       "learning_rate": 0.0006097301890800934,
-      "loss": 1.6531,
       "step": 2360
     },
     {
       "epoch": 4.550669216061186,
-      "grad_norm": 1.7890552282333374,
       "learning_rate": 0.0006054811982154238,
-      "loss": 1.6539,
       "step": 2380
     },
     {
       "epoch": 4.588910133843212,
-      "grad_norm": 1.8423861265182495,
       "learning_rate": 0.0006012322073507542,
-      "loss": 1.6681,
       "step": 2400
     },
     {
       "epoch": 4.627151051625239,
-      "grad_norm": 1.6434499025344849,
       "learning_rate": 0.0005969832164860845,
-      "loss": 1.5935,
       "step": 2420
     },
     {
       "epoch": 4.665391969407266,
-      "grad_norm": 1.7261130809783936,
       "learning_rate": 0.0005927342256214149,
-      "loss": 1.6273,
       "step": 2440
     },
     {
       "epoch": 4.7036328871892925,
-      "grad_norm": 1.7288273572921753,
       "learning_rate": 0.0005884852347567453,
-      "loss": 1.6181,
       "step": 2460
     },
     {
       "epoch": 4.741873804971319,
-      "grad_norm": 1.773258924484253,
       "learning_rate": 0.0005842362438920756,
-      "loss": 1.5719,
       "step": 2480
     },
     {
       "epoch": 4.780114722753346,
-      "grad_norm": 1.7676658630371094,
       "learning_rate": 0.000579987253027406,
-      "loss": 1.578,
       "step": 2500
     },
     {
       "epoch": 4.818355640535373,
-      "grad_norm": 1.8115794658660889,
       "learning_rate": 0.0005757382621627364,
-      "loss": 1.535,
       "step": 2520
     },
     {
       "epoch": 4.8565965583174,
-      "grad_norm": 1.7989414930343628,
       "learning_rate": 0.0005714892712980667,
-      "loss": 1.5493,
       "step": 2540
     },
     {
       "epoch": 4.894837476099426,
-      "grad_norm": 1.6607849597930908,
       "learning_rate": 0.000567240280433397,
-      "loss": 1.5489,
       "step": 2560
     },
     {
       "epoch": 4.933078393881453,
-      "grad_norm": 1.630257487297058,
       "learning_rate": 0.0005629912895687275,
-      "loss": 1.5091,
       "step": 2580
     },
     {
       "epoch": 4.97131931166348,
-      "grad_norm": 1.7995944023132324,
       "learning_rate": 0.0005587422987040578,
-      "loss": 1.5275,
       "step": 2600
     },
     {
       "epoch": 5.0,
-      "eval_accuracy": 0.8571140262361251,
-      "eval_loss": 0.6697778105735779,
-      "eval_runtime": 181.3784,
-      "eval_samples_per_second": 81.956,
-      "eval_steps_per_second": 81.956,
       "step": 2615
     },
     {
       "epoch": 5.009560229445507,
-      "grad_norm": 1.7868553400039673,
       "learning_rate": 0.0005544933078393881,
-      "loss": 1.4774,
       "step": 2620
     },
     {
       "epoch": 5.047801147227533,
-      "grad_norm": 1.6380654573440552,
       "learning_rate": 0.0005502443169747186,
-      "loss": 1.3955,
       "step": 2640
     },
     {
       "epoch": 5.08604206500956,
-      "grad_norm": 1.7844533920288086,
       "learning_rate": 0.0005459953261100489,
-      "loss": 1.4414,
       "step": 2660
     },
     {
       "epoch": 5.124282982791587,
-      "grad_norm": 1.779080867767334,
       "learning_rate": 0.0005417463352453792,
-      "loss": 1.3782,
       "step": 2680
     },
     {
       "epoch": 5.162523900573614,
-      "grad_norm": 1.741326928138733,
       "learning_rate": 0.0005374973443807097,
-      "loss": 1.4152,
       "step": 2700
     },
     {
       "epoch": 5.2007648183556405,
-      "grad_norm": 1.7447401285171509,
       "learning_rate": 0.00053324835351604,
-      "loss": 1.3996,
       "step": 2720
     },
     {
       "epoch": 5.239005736137667,
-      "grad_norm": 1.8067736625671387,
       "learning_rate": 0.0005289993626513702,
-      "loss": 1.4137,
       "step": 2740
     },
     {
       "epoch": 5.277246653919694,
-      "grad_norm": 1.7393046617507935,
       "learning_rate": 0.0005247503717867008,
-      "loss": 1.3937,
       "step": 2760
     },
     {
       "epoch": 5.315487571701721,
-      "grad_norm": 1.756184458732605,
       "learning_rate": 0.000520501380922031,
-      "loss": 1.3912,
       "step": 2780
     },
     {
       "epoch": 5.353728489483748,
-      "grad_norm": 1.7133733034133911,
       "learning_rate": 0.0005162523900573613,
-      "loss": 1.387,
       "step": 2800
     },
     {
       "epoch": 5.3919694072657744,
-      "grad_norm": 1.6597713232040405,
       "learning_rate": 0.0005120033991926918,
-      "loss": 1.3551,
       "step": 2820
     },
     {
       "epoch": 5.430210325047801,
-      "grad_norm": 1.8462845087051392,
       "learning_rate": 0.0005077544083280221,
-      "loss": 1.3557,
       "step": 2840
     },
     {
       "epoch": 5.468451242829828,
-      "grad_norm": 1.6737143993377686,
       "learning_rate": 0.0005035054174633524,
-      "loss": 1.3495,
       "step": 2860
     },
     {
       "epoch": 5.506692160611855,
-      "grad_norm": 1.7071157693862915,
       "learning_rate": 0.0004992564265986828,
-      "loss": 1.394,
       "step": 2880
     },
     {
       "epoch": 5.544933078393882,
-      "grad_norm": 1.663072943687439,
       "learning_rate": 0.0004950074357340132,
-      "loss": 1.3263,
       "step": 2900
     },
     {
       "epoch": 5.583173996175908,
-      "grad_norm": 1.640093207359314,
       "learning_rate": 0.0004907584448693436,
-      "loss": 1.3474,
       "step": 2920
     },
     {
       "epoch": 5.621414913957935,
-      "grad_norm": 1.762568712234497,
       "learning_rate": 0.0004865094540046739,
-      "loss": 1.3375,
       "step": 2940
     },
     {
       "epoch": 5.659655831739962,
-      "grad_norm": 1.6714434623718262,
       "learning_rate": 0.00048226046314000425,
-      "loss": 1.3218,
       "step": 2960
     },
     {
       "epoch": 5.6978967495219885,
-      "grad_norm": 1.7594107389450073,
       "learning_rate": 0.0004780114722753346,
-      "loss": 1.3008,
       "step": 2980
     },
     {
       "epoch": 5.736137667304015,
-      "grad_norm": 1.6483973264694214,
       "learning_rate": 0.000473762481410665,
-      "loss": 1.3331,
       "step": 3000
     },
     {
       "epoch": 5.774378585086042,
-      "grad_norm": 1.7252651453018188,
       "learning_rate": 0.00046951349054599533,
-      "loss": 1.2775,
       "step": 3020
     },
     {
       "epoch": 5.812619502868069,
-      "grad_norm": 1.7860745191574097,
       "learning_rate": 0.0004652644996813257,
-      "loss": 1.2747,
       "step": 3040
     },
     {
       "epoch": 5.850860420650095,
-      "grad_norm": 1.749874234199524,
       "learning_rate": 0.0004610155088166561,
-      "loss": 1.2946,
       "step": 3060
     },
     {
       "epoch": 5.8891013384321225,
-      "grad_norm": 1.7197644710540771,
       "learning_rate": 0.0004567665179519864,
-      "loss": 1.2849,
       "step": 3080
     },
     {
       "epoch": 5.927342256214149,
-      "grad_norm": 1.6396132707595825,
       "learning_rate": 0.00045251752708731676,
-      "loss": 1.2544,
       "step": 3100
     },
     {
       "epoch": 5.965583173996176,
-      "grad_norm": 1.720376968383789,
       "learning_rate": 0.0004482685362226471,
-      "loss": 1.248,
       "step": 3120
     },
     {
       "epoch": 6.0,
-      "eval_accuracy": 0.8899428187016482,
-      "eval_loss": 0.5270123481750488,
-      "eval_runtime": 183.7621,
-      "eval_samples_per_second": 80.893,
-      "eval_steps_per_second": 80.893,
       "step": 3138
     },
     {
       "epoch": 6.003824091778203,
-      "grad_norm": 1.5206599235534668,
       "learning_rate": 0.0004440195453579775,
-      "loss": 1.2398,
       "step": 3140
     },
     {
       "epoch": 6.042065009560229,
-      "grad_norm": 1.7172082662582397,
       "learning_rate": 0.00043977055449330785,
-      "loss": 1.2097,
       "step": 3160
     },
     {
       "epoch": 6.080305927342256,
-      "grad_norm": 1.5570909976959229,
       "learning_rate": 0.0004355215636286382,
-      "loss": 1.1669,
       "step": 3180
     },
     {
       "epoch": 6.118546845124283,
-      "grad_norm": 1.7044614553451538,
       "learning_rate": 0.0004312725727639686,
-      "loss": 1.1647,
       "step": 3200
     },
     {
       "epoch": 6.15678776290631,
-      "grad_norm": 1.5819571018218994,
       "learning_rate": 0.0004270235818992989,
-      "loss": 1.1627,
       "step": 3220
     },
     {
       "epoch": 6.195028680688337,
-      "grad_norm": 1.7076871395111084,
       "learning_rate": 0.0004227745910346293,
-      "loss": 1.1728,
       "step": 3240
     },
     {
       "epoch": 6.233269598470363,
-      "grad_norm": 1.7301490306854248,
       "learning_rate": 0.0004185256001699597,
-      "loss": 1.1459,
       "step": 3260
     },
     {
       "epoch": 6.27151051625239,
-      "grad_norm": 1.7135626077651978,
       "learning_rate": 0.00041427660930528997,
-      "loss": 1.1676,
       "step": 3280
     },
     {
       "epoch": 6.309751434034417,
-      "grad_norm": 1.602142572402954,
       "learning_rate": 0.00041002761844062037,
-      "loss": 1.1488,
       "step": 3300
     },
     {
       "epoch": 6.347992351816444,
-      "grad_norm": 1.755293846130371,
       "learning_rate": 0.00040577862757595076,
-      "loss": 1.1395,
       "step": 3320
     },
     {
       "epoch": 6.3862332695984705,
-      "grad_norm": 1.663662314414978,
       "learning_rate": 0.00040152963671128105,
-      "loss": 1.1321,
       "step": 3340
     },
     {
       "epoch": 6.424474187380497,
-      "grad_norm": 1.7366993427276611,
       "learning_rate": 0.00039728064584661145,
-      "loss": 1.1317,
       "step": 3360
     },
     {
       "epoch": 6.462715105162524,
-      "grad_norm": 1.7560149431228638,
       "learning_rate": 0.0003930316549819418,
-      "loss": 1.1449,
       "step": 3380
     },
     {
       "epoch": 6.500956022944551,
-      "grad_norm": 1.7576582431793213,
       "learning_rate": 0.00038878266411727214,
-      "loss": 1.13,
       "step": 3400
     },
     {
       "epoch": 6.539196940726577,
-      "grad_norm": 1.7916873693466187,
       "learning_rate": 0.00038453367325260254,
-      "loss": 1.1419,
       "step": 3420
     },
     {
       "epoch": 6.577437858508604,
-      "grad_norm": 1.5987508296966553,
       "learning_rate": 0.0003802846823879329,
-      "loss": 1.1107,
       "step": 3440
     },
     {
       "epoch": 6.615678776290631,
-      "grad_norm": 1.8192518949508667,
       "learning_rate": 0.0003760356915232632,
-      "loss": 1.1162,
       "step": 3460
     },
     {
       "epoch": 6.653919694072657,
-      "grad_norm": 1.7236486673355103,
       "learning_rate": 0.0003717867006585936,
-      "loss": 1.1255,
       "step": 3480
     },
     {
       "epoch": 6.692160611854685,
-      "grad_norm": 1.8209389448165894,
       "learning_rate": 0.0003675377097939239,
-      "loss": 1.0629,
       "step": 3500
     },
     {
       "epoch": 6.730401529636711,
-      "grad_norm": 1.652782678604126,
       "learning_rate": 0.0003632887189292543,
-      "loss": 1.0809,
       "step": 3520
     },
     {
       "epoch": 6.768642447418738,
-      "grad_norm": 1.6148645877838135,
       "learning_rate": 0.00035903972806458466,
-      "loss": 1.1286,
       "step": 3540
     },
     {
       "epoch": 6.806883365200765,
-      "grad_norm": 1.6869423389434814,
       "learning_rate": 0.000354790737199915,
-      "loss": 1.1069,
       "step": 3560
     },
     {
       "epoch": 6.845124282982791,
-      "grad_norm": 1.6373172998428345,
       "learning_rate": 0.0003505417463352454,
-      "loss": 1.0911,
       "step": 3580
     },
     {
       "epoch": 6.8833652007648185,
-      "grad_norm": 1.6761549711227417,
       "learning_rate": 0.00034629275547057574,
-      "loss": 1.0808,
       "step": 3600
     },
     {
       "epoch": 6.921606118546845,
-      "grad_norm": 1.6510460376739502,
       "learning_rate": 0.0003420437646059061,
-      "loss": 1.0809,
       "step": 3620
     },
     {
       "epoch": 6.959847036328872,
-      "grad_norm": 1.7351855039596558,
       "learning_rate": 0.0003377947737412365,
       "loss": 1.0912,
       "step": 3640
     },
     {
       "epoch": 6.998087954110899,
-      "grad_norm": 1.7165274620056152,
       "learning_rate": 0.00033354578287656683,
-      "loss": 1.0991,
       "step": 3660
     },
     {
       "epoch": 7.0,
-      "eval_accuracy": 0.9037336024217961,
-      "eval_loss": 0.44995447993278503,
-      "eval_runtime": 189.5634,
-      "eval_samples_per_second": 78.417,
-      "eval_steps_per_second": 78.417,
       "step": 3661
     },
     {
       "epoch": 7.036328871892925,
-      "grad_norm": 1.6468501091003418,
       "learning_rate": 0.0003292967920118972,
-      "loss": 1.0154,
       "step": 3680
     },
     {
       "epoch": 7.074569789674952,
-      "grad_norm": 1.79421067237854,
       "learning_rate": 0.0003250478011472275,
-      "loss": 1.0378,
       "step": 3700
     },
     {
       "epoch": 7.112810707456979,
-      "grad_norm": 1.7234885692596436,
       "learning_rate": 0.0003207988102825579,
-      "loss": 1.0145,
       "step": 3720
     },
     {
       "epoch": 7.151051625239006,
-      "grad_norm": 1.6947157382965088,
       "learning_rate": 0.00031654981941788826,
-      "loss": 1.0012,
       "step": 3740
     },
     {
       "epoch": 7.189292543021033,
-      "grad_norm": 1.6818758249282837,
       "learning_rate": 0.0003123008285532186,
-      "loss": 1.0191,
       "step": 3760
     },
     {
       "epoch": 7.227533460803059,
-      "grad_norm": 1.557080864906311,
       "learning_rate": 0.000308051837688549,
-      "loss": 1.0437,
       "step": 3780
     },
     {
       "epoch": 7.265774378585086,
-      "grad_norm": 1.6532793045043945,
       "learning_rate": 0.00030380284682387935,
-      "loss": 1.0079,
       "step": 3800
     },
     {
       "epoch": 7.304015296367113,
-      "grad_norm": 1.646686315536499,
       "learning_rate": 0.0002995538559592097,
-      "loss": 0.9904,
       "step": 3820
     },
     {
       "epoch": 7.342256214149139,
-      "grad_norm": 1.6772829294204712,
       "learning_rate": 0.0002953048650945401,
-      "loss": 1.0002,
       "step": 3840
     },
     {
       "epoch": 7.3804971319311665,
-      "grad_norm": 1.6452054977416992,
       "learning_rate": 0.0002910558742298704,
-      "loss": 0.9674,
       "step": 3860
     },
     {
       "epoch": 7.418738049713193,
-      "grad_norm": 1.592207908630371,
       "learning_rate": 0.0002868068833652008,
-      "loss": 0.9642,
       "step": 3880
     },
     {
       "epoch": 7.45697896749522,
-      "grad_norm": 1.7015941143035889,
       "learning_rate": 0.0002825578925005312,
-      "loss": 0.973,
       "step": 3900
     },
     {
       "epoch": 7.495219885277247,
-      "grad_norm": 1.6589232683181763,
       "learning_rate": 0.00027830890163586146,
-      "loss": 0.9803,
       "step": 3920
     },
     {
       "epoch": 7.533460803059273,
-      "grad_norm": 1.660190463066101,
       "learning_rate": 0.00027405991077119186,
-      "loss": 0.9702,
       "step": 3940
     },
     {
       "epoch": 7.5717017208413,
-      "grad_norm": 1.7052509784698486,
       "learning_rate": 0.00026981091990652226,
-      "loss": 0.9919,
       "step": 3960
     },
     {
       "epoch": 7.609942638623327,
-      "grad_norm": 1.6874445676803589,
       "learning_rate": 0.00026556192904185255,
-      "loss": 0.938,
       "step": 3980
     },
     {
       "epoch": 7.648183556405353,
-      "grad_norm": 1.811640739440918,
       "learning_rate": 0.00026131293817718295,
-      "loss": 0.9564,
       "step": 4000
     },
     {
       "epoch": 7.686424474187381,
-      "grad_norm": 1.741968035697937,
       "learning_rate": 0.00025706394731251324,
-      "loss": 0.9432,
       "step": 4020
     },
     {
       "epoch": 7.724665391969407,
-      "grad_norm": 1.6731518507003784,
       "learning_rate": 0.00025281495644784364,
-      "loss": 0.9082,
       "step": 4040
     },
     {
       "epoch": 7.762906309751434,
-      "grad_norm": 1.7399870157241821,
       "learning_rate": 0.00024856596558317403,
-      "loss": 0.92,
       "step": 4060
     },
     {
       "epoch": 7.801147227533461,
-      "grad_norm": 1.580674171447754,
       "learning_rate": 0.0002443169747185044,
-      "loss": 0.9433,
       "step": 4080
     },
     {
       "epoch": 7.839388145315487,
-      "grad_norm": 1.5683550834655762,
       "learning_rate": 0.00024006798385383472,
-      "loss": 0.9584,
       "step": 4100
     },
     {
       "epoch": 7.8776290630975145,
-      "grad_norm": 1.7664682865142822,
       "learning_rate": 0.00023581899298916507,
-      "loss": 0.9318,
       "step": 4120
     },
     {
       "epoch": 7.915869980879541,
-      "grad_norm": 1.5522887706756592,
       "learning_rate": 0.00023157000212449544,
-      "loss": 0.8852,
       "step": 4140
     },
     {
       "epoch": 7.954110898661568,
-      "grad_norm": 1.626836895942688,
       "learning_rate": 0.00022732101125982578,
-      "loss": 0.9121,
       "step": 4160
     },
     {
       "epoch": 7.992351816443595,
-      "grad_norm": 1.8042898178100586,
       "learning_rate": 0.00022307202039515615,
-      "loss": 0.9221,
       "step": 4180
     },
     {
       "epoch": 8.0,
-      "eval_accuracy": 0.9266733938782374,
-      "eval_loss": 0.3572401702404022,
-      "eval_runtime": 168.0336,
-      "eval_samples_per_second": 88.464,
-      "eval_steps_per_second": 88.464,
       "step": 4184
     },
     {
       "epoch": 8.030592734225621,
-      "grad_norm": 1.545024037361145,
       "learning_rate": 0.00021882302953048652,
-      "loss": 0.8979,
       "step": 4200
     },
     {
       "epoch": 8.068833652007648,
-      "grad_norm": 1.592607855796814,
       "learning_rate": 0.00021457403866581687,
-      "loss": 0.8464,
       "step": 4220
     },
     {
       "epoch": 8.107074569789676,
-      "grad_norm": 1.5347646474838257,
       "learning_rate": 0.0002103250478011472,
-      "loss": 0.8634,
       "step": 4240
     },
     {
       "epoch": 8.145315487571702,
-      "grad_norm": 1.621201515197754,
       "learning_rate": 0.0002060760569364776,
-      "loss": 0.8425,
       "step": 4260
     },
     {
       "epoch": 8.183556405353729,
-      "grad_norm": 1.7381062507629395,
       "learning_rate": 0.00020182706607180795,
-      "loss": 0.8776,
       "step": 4280
     },
     {
       "epoch": 8.221797323135755,
-      "grad_norm": 1.5798373222351074,
       "learning_rate": 0.0001975780752071383,
-      "loss": 0.854,
       "step": 4300
     },
     {
       "epoch": 8.260038240917781,
-      "grad_norm": 1.5751338005065918,
       "learning_rate": 0.00019332908434246867,
-      "loss": 0.8646,
       "step": 4320
     },
     {
       "epoch": 8.29827915869981,
-      "grad_norm": 1.570742130279541,
       "learning_rate": 0.00018908009347779904,
-      "loss": 0.8521,
       "step": 4340
     },
     {
       "epoch": 8.336520076481836,
-      "grad_norm": 1.7959846258163452,
       "learning_rate": 0.00018483110261312938,
-      "loss": 0.8858,
       "step": 4360
     },
     {
       "epoch": 8.374760994263863,
-      "grad_norm": 1.7537908554077148,
       "learning_rate": 0.00018058211174845973,
-      "loss": 0.8707,
       "step": 4380
     },
     {
       "epoch": 8.413001912045889,
-      "grad_norm": 1.635578989982605,
       "learning_rate": 0.0001763331208837901,
-      "loss": 0.8375,
       "step": 4400
     },
     {
       "epoch": 8.451242829827915,
-      "grad_norm": 1.5729222297668457,
       "learning_rate": 0.00017208413001912047,
-      "loss": 0.8631,
       "step": 4420
     },
     {
       "epoch": 8.489483747609942,
-      "grad_norm": 1.6586476564407349,
       "learning_rate": 0.00016783513915445082,
-      "loss": 0.8551,
       "step": 4440
     },
     {
       "epoch": 8.52772466539197,
-      "grad_norm": 1.6118619441986084,
       "learning_rate": 0.00016358614828978119,
-      "loss": 0.8637,
       "step": 4460
     },
     {
       "epoch": 8.565965583173996,
-      "grad_norm": 1.5538595914840698,
       "learning_rate": 0.00015933715742511153,
-      "loss": 0.8484,
       "step": 4480
     },
     {
       "epoch": 8.604206500956023,
-      "grad_norm": 1.5646642446517944,
       "learning_rate": 0.0001550881665604419,
-      "loss": 0.8433,
       "step": 4500
     },
     {
       "epoch": 8.64244741873805,
-      "grad_norm": 1.7190415859222412,
       "learning_rate": 0.00015083917569577227,
-      "loss": 0.8592,
       "step": 4520
     },
     {
       "epoch": 8.680688336520076,
-      "grad_norm": 1.4950307607650757,
       "learning_rate": 0.00014659018483110262,
-      "loss": 0.8236,
       "step": 4540
     },
     {
       "epoch": 8.718929254302104,
-      "grad_norm": 1.5117732286453247,
       "learning_rate": 0.00014234119396643296,
-      "loss": 0.8421,
       "step": 4560
     },
     {
       "epoch": 8.75717017208413,
-      "grad_norm": 1.5558750629425049,
       "learning_rate": 0.00013809220310176336,
-      "loss": 0.8287,
       "step": 4580
     },
     {
       "epoch": 8.795411089866157,
-      "grad_norm": 1.7955564260482788,
       "learning_rate": 0.0001338432122370937,
-      "loss": 0.8492,
       "step": 4600
     },
     {
       "epoch": 8.833652007648183,
-      "grad_norm": 1.6532599925994873,
       "learning_rate": 0.00012959422137242405,
-      "loss": 0.8419,
       "step": 4620
     },
     {
       "epoch": 8.87189292543021,
-      "grad_norm": 1.7040739059448242,
       "learning_rate": 0.0001253452305077544,
-      "loss": 0.8125,
       "step": 4640
     },
     {
       "epoch": 8.910133843212238,
-      "grad_norm": 1.7040703296661377,
       "learning_rate": 0.00012109623964308478,
-      "loss": 0.8187,
       "step": 4660
     },
     {
       "epoch": 8.948374760994264,
-      "grad_norm": 1.7090845108032227,
       "learning_rate": 0.00011684724877841513,
-      "loss": 0.8155,
       "step": 4680
     },
     {
       "epoch": 8.98661567877629,
-      "grad_norm": 1.6070616245269775,
       "learning_rate": 0.00011259825791374549,
-      "loss": 0.7997,
       "step": 4700
     },
     {
       "epoch": 9.0,
       "eval_accuracy": 0.9352842246888665,
-      "eval_loss": 0.3138497769832611,
-      "eval_runtime": 177.561,
-      "eval_samples_per_second": 83.718,
-      "eval_steps_per_second": 83.718,
       "step": 4707
     },
     {
       "epoch": 9.024856596558317,
-      "grad_norm": 1.5590825080871582,
       "learning_rate": 0.00010834926704907585,
-      "loss": 0.7906,
       "step": 4720
     },
     {
       "epoch": 9.063097514340344,
-      "grad_norm": 1.4745252132415771,
       "learning_rate": 0.0001041002761844062,
-      "loss": 0.7494,
       "step": 4740
     },
     {
       "epoch": 9.101338432122372,
-      "grad_norm": 1.61099112033844,
       "learning_rate": 9.985128531973658e-05,
-      "loss": 0.7854,
       "step": 4760
     },
     {
       "epoch": 9.139579349904398,
-      "grad_norm": 1.5839650630950928,
       "learning_rate": 9.560229445506692e-05,
-      "loss": 0.7636,
       "step": 4780
     },
     {
       "epoch": 9.177820267686425,
-      "grad_norm": 1.7259138822555542,
       "learning_rate": 9.135330359039729e-05,
-      "loss": 0.776,
       "step": 4800
     },
     {
       "epoch": 9.216061185468451,
-      "grad_norm": 1.5495970249176025,
       "learning_rate": 8.710431272572764e-05,
-      "loss": 0.7444,
       "step": 4820
     },
     {
       "epoch": 9.254302103250478,
-      "grad_norm": 1.5250838994979858,
       "learning_rate": 8.2855321861058e-05,
-      "loss": 0.7603,
       "step": 4840
     },
     {
       "epoch": 9.292543021032504,
-      "grad_norm": 1.6244220733642578,
       "learning_rate": 7.860633099638836e-05,
-      "loss": 0.7561,
       "step": 4860
     },
     {
       "epoch": 9.330783938814532,
-      "grad_norm": 1.6825993061065674,
       "learning_rate": 7.435734013171871e-05,
-      "loss": 0.7908,
       "step": 4880
     },
     {
       "epoch": 9.369024856596559,
-      "grad_norm": 1.563707947731018,
       "learning_rate": 7.010834926704908e-05,
-      "loss": 0.7517,
       "step": 4900
     },
     {
       "epoch": 9.407265774378585,
-      "grad_norm": 1.7463629245758057,
       "learning_rate": 6.585935840237942e-05,
-      "loss": 0.7679,
       "step": 4920
     },
     {
       "epoch": 9.445506692160611,
-      "grad_norm": 1.5689053535461426,
       "learning_rate": 6.16103675377098e-05,
-      "loss": 0.7426,
       "step": 4940
     },
     {
       "epoch": 9.483747609942638,
-      "grad_norm": 1.6512914896011353,
       "learning_rate": 5.736137667304015e-05,
-      "loss": 0.7695,
       "step": 4960
     },
     {
       "epoch": 9.521988527724666,
-      "grad_norm": 1.6542084217071533,
       "learning_rate": 5.311238580837052e-05,
-      "loss": 0.7603,
       "step": 4980
     },
     {
       "epoch": 9.560229445506693,
-      "grad_norm": 1.6929945945739746,
       "learning_rate": 4.8863394943700874e-05,
-      "loss": 0.754,
       "step": 5000
     },
     {
       "epoch": 9.598470363288719,
-      "grad_norm": 1.4880517721176147,
       "learning_rate": 4.461440407903123e-05,
-      "loss": 0.7597,
       "step": 5020
     },
     {
       "epoch": 9.636711281070745,
-      "grad_norm": 1.578971266746521,
       "learning_rate": 4.036541321436159e-05,
-      "loss": 0.7624,
       "step": 5040
     },
     {
       "epoch": 9.674952198852772,
-      "grad_norm": 1.616727352142334,
       "learning_rate": 3.6116422349691954e-05,
-      "loss": 0.7653,
       "step": 5060
     },
     {
       "epoch": 9.7131931166348,
-      "grad_norm": 1.6762784719467163,
       "learning_rate": 3.186743148502231e-05,
-      "loss": 0.7352,
       "step": 5080
     },
     {
       "epoch": 9.751434034416826,
-      "grad_norm": 1.5666388273239136,
       "learning_rate": 2.7618440620352666e-05,
-      "loss": 0.732,
       "step": 5100
     },
     {
       "epoch": 9.789674952198853,
-      "grad_norm": 1.641012191772461,
       "learning_rate": 2.3369449755683023e-05,
-      "loss": 0.7631,
       "step": 5120
     },
     {
       "epoch": 9.82791586998088,
-      "grad_norm": 1.7024327516555786,
       "learning_rate": 1.9120458891013384e-05,
-      "loss": 0.7153,
       "step": 5140
     },
     {
       "epoch": 9.866156787762906,
-      "grad_norm": 1.4840829372406006,
       "learning_rate": 1.4871468026343743e-05,
-      "loss": 0.7247,
       "step": 5160
     },
     {
       "epoch": 9.904397705544934,
-      "grad_norm": 1.627562165260315,
       "learning_rate": 1.0622477161674103e-05,
-      "loss": 0.7289,
       "step": 5180
     },
     {
       "epoch": 9.94263862332696,
-      "grad_norm": 1.6473079919815063,
       "learning_rate": 6.373486297004461e-06,
-      "loss": 0.7563,
       "step": 5200
     },
     {
       "epoch": 9.980879541108987,
-      "grad_norm": 1.577776312828064,
       "learning_rate": 2.1244954323348204e-06,
-      "loss": 0.7603,
       "step": 5220
     },
     {
       "epoch": 10.0,
-      "eval_accuracy": 0.9410023545240498,
-      "eval_loss": 0.29460111260414124,
-      "eval_runtime": 187.2975,
-      "eval_samples_per_second": 79.366,
-      "eval_steps_per_second": 79.366,
       "step": 5230
     },
     {
       "epoch": 10.0,
       "step": 5230,
       "total_flos": 1.96318398191328e+18,
-      "train_loss": 2.1099621225725396,
-      "train_runtime": 19361.582,
-      "train_samples_per_second": 69.094,
-      "train_steps_per_second": 0.27
     }
   ],
   "logging_steps": 20,

 {
+  "best_metric": 0.9405314497140935,
   "best_model_checkpoint": "/mnt/data4_HDD_14TB/yang/voxceleb-checkpoints/xvector/voxceleb1/finetune/ce-len3-bs256-lr1e-3/checkpoint-5230",
   "epoch": 10.0,
   "eval_steps": 500,
   "log_history": [
     {
       "epoch": 0.03824091778202677,
+      "grad_norm": 4.702692031860352,
       "learning_rate": 3.824091778202677e-05,
+      "loss": 7.1455,
       "step": 20
     },
     {
       "epoch": 0.07648183556405354,
+      "grad_norm": 4.504467487335205,
       "learning_rate": 7.648183556405354e-05,
+      "loss": 7.116,
       "step": 40
     },
     {
       "epoch": 0.1147227533460803,
+      "grad_norm": 3.964728832244873,
       "learning_rate": 0.0001147227533460803,
+      "loss": 7.0566,
       "step": 60
     },
     {
       "epoch": 0.15296367112810708,
+      "grad_norm": 3.199570417404175,
       "learning_rate": 0.00015296367112810707,
+      "loss": 6.972,
       "step": 80
     },
     {
       "epoch": 0.19120458891013384,
+      "grad_norm": 2.6367344856262207,
       "learning_rate": 0.00019120458891013384,
+      "loss": 6.8778,
       "step": 100
     },
     {
       "epoch": 0.2294455066921606,
+      "grad_norm": 2.1930582523345947,
       "learning_rate": 0.0002294455066921606,
+      "loss": 6.733,
       "step": 120
     },
     {
       "epoch": 0.2676864244741874,
+      "grad_norm": 1.9982482194900513,
       "learning_rate": 0.0002676864244741874,
+      "loss": 6.5814,
       "step": 140
     },
     {
       "epoch": 0.30592734225621415,
+      "grad_norm": 1.8051823377609253,
       "learning_rate": 0.00030592734225621415,
+      "loss": 6.4206,
       "step": 160
     },
     {
       "epoch": 0.3441682600382409,
+      "grad_norm": 1.757859706878662,
       "learning_rate": 0.00034416826003824094,
+      "loss": 6.2424,
       "step": 180
     },
     {
       "epoch": 0.3824091778202677,
+      "grad_norm": 1.7747690677642822,
       "learning_rate": 0.0003824091778202677,
+      "loss": 6.0657,
       "step": 200
     },
     {
       "epoch": 0.42065009560229444,
+      "grad_norm": 1.7719988822937012,
       "learning_rate": 0.0004206500956022944,
+      "loss": 5.9722,
       "step": 220
     },
     {
       "epoch": 0.4588910133843212,
+      "grad_norm": 1.934911847114563,
       "learning_rate": 0.0004588910133843212,
+      "loss": 5.814,
       "step": 240
     },
     {
       "epoch": 0.497131931166348,
+      "grad_norm": 1.793244481086731,
       "learning_rate": 0.0004971319311663481,
+      "loss": 5.6998,
       "step": 260
     },
     {
       "epoch": 0.5353728489483748,
+      "grad_norm": 1.7790417671203613,
       "learning_rate": 0.0005353728489483748,
+      "loss": 5.6115,
       "step": 280
     },
     {
       "epoch": 0.5736137667304015,
+      "grad_norm": 1.8355106115341187,
       "learning_rate": 0.0005736137667304016,
+      "loss": 5.5236,
       "step": 300
     },
     {
       "epoch": 0.6118546845124283,
+      "grad_norm": 1.7114174365997314,
       "learning_rate": 0.0006118546845124283,
+      "loss": 5.3894,
       "step": 320
     },
     {
       "epoch": 0.6500956022944551,
+      "grad_norm": 1.877690315246582,
       "learning_rate": 0.000650095602294455,
+      "loss": 5.3336,
       "step": 340
     },
     {
       "epoch": 0.6883365200764818,
+      "grad_norm": 1.8771674633026123,
       "learning_rate": 0.0006883365200764819,
+      "loss": 5.2577,
       "step": 360
     },
     {
       "epoch": 0.7265774378585086,
+      "grad_norm": 1.9654275178909302,
       "learning_rate": 0.0007265774378585086,
+      "loss": 5.1333,
       "step": 380
     },
     {
       "epoch": 0.7648183556405354,
+      "grad_norm": 1.9327517747879028,
       "learning_rate": 0.0007648183556405354,
+      "loss": 5.0882,
       "step": 400
     },
     {
       "epoch": 0.8030592734225621,
+      "grad_norm": 1.8918468952178955,
       "learning_rate": 0.0008030592734225621,
+      "loss": 5.0404,
       "step": 420
     },
     {
       "epoch": 0.8413001912045889,
+      "grad_norm": 1.8536239862442017,
       "learning_rate": 0.0008413001912045888,
+      "loss": 4.9524,
       "step": 440
     },
     {
       "epoch": 0.8795411089866156,
+      "grad_norm": 1.8849778175354004,
       "learning_rate": 0.0008795411089866157,
+      "loss": 4.8673,
       "step": 460
     },
     {
       "epoch": 0.9177820267686424,
+      "grad_norm": 1.8270999193191528,
       "learning_rate": 0.0009177820267686424,
+      "loss": 4.8456,
       "step": 480
     },
     {
       "epoch": 0.9560229445506692,
+      "grad_norm": 1.8521556854248047,
       "learning_rate": 0.0009560229445506692,
+      "loss": 4.7748,
       "step": 500
     },
     {
       "epoch": 0.994263862332696,
+      "grad_norm": 1.9331077337265015,
       "learning_rate": 0.0009942638623326961,
+      "loss": 4.6869,
       "step": 520
     },
     {
       "epoch": 1.0,
+      "eval_accuracy": 0.19596367305751766,
+      "eval_loss": 4.119868278503418,
+      "eval_runtime": 539.7217,
+      "eval_samples_per_second": 27.542,
+      "eval_steps_per_second": 27.542,
       "step": 523
     },
     {
       "epoch": 1.0325047801147227,
+      "grad_norm": 1.7894033193588257,
       "learning_rate": 0.0009963883577650309,
+      "loss": 4.5805,
       "step": 540
     },
     {
       "epoch": 1.0707456978967496,
+      "grad_norm": 1.9136412143707275,
       "learning_rate": 0.0009921393669003612,
+      "loss": 4.5223,
       "step": 560
     },
     {
       "epoch": 1.1089866156787762,
+      "grad_norm": 1.8342599868774414,
       "learning_rate": 0.0009878903760356915,
+      "loss": 4.4213,
       "step": 580
     },
     {
       "epoch": 1.147227533460803,
+      "grad_norm": 1.7360094785690308,
       "learning_rate": 0.0009836413851710218,
+      "loss": 4.409,
       "step": 600
     },
     {
       "epoch": 1.1854684512428297,
+      "grad_norm": 1.8302013874053955,
       "learning_rate": 0.0009793923943063523,
+      "loss": 4.3382,
       "step": 620
     },
     {
       "epoch": 1.2237093690248566,
+      "grad_norm": 1.847433090209961,
       "learning_rate": 0.0009751434034416827,
+      "loss": 4.2381,
       "step": 640
     },
     {
       "epoch": 1.2619502868068833,
+      "grad_norm": 1.866734504699707,
       "learning_rate": 0.000970894412577013,
+      "loss": 4.1802,
       "step": 660
     },
     {
       "epoch": 1.3001912045889101,
+      "grad_norm": 1.9123674631118774,
       "learning_rate": 0.0009666454217123433,
+      "loss": 4.1404,
       "step": 680
     },
     {
       "epoch": 1.338432122370937,
+      "grad_norm": 1.8355252742767334,
       "learning_rate": 0.0009623964308476737,
+      "loss": 4.059,
       "step": 700
     },
     {
       "epoch": 1.3766730401529637,
+      "grad_norm": 1.7890186309814453,
       "learning_rate": 0.000958147439983004,
+      "loss": 4.0086,
       "step": 720
     },
     {
       "epoch": 1.4149139579349903,
+      "grad_norm": 1.847299337387085,
       "learning_rate": 0.0009538984491183344,
+      "loss": 3.9641,
       "step": 740
     },
     {
       "epoch": 1.4531548757170172,
+      "grad_norm": 1.8219211101531982,
       "learning_rate": 0.0009496494582536647,
+      "loss": 3.9291,
       "step": 760
     },
     {
       "epoch": 1.491395793499044,
+      "grad_norm": 1.8026444911956787,
       "learning_rate": 0.0009454004673889951,
+      "loss": 3.8577,
       "step": 780
     },
     {
       "epoch": 1.5296367112810707,
+      "grad_norm": 1.7959771156311035,
       "learning_rate": 0.0009411514765243255,
+      "loss": 3.8234,
       "step": 800
     },
     {
       "epoch": 1.5678776290630974,
+      "grad_norm": 1.7638428211212158,
       "learning_rate": 0.0009369024856596558,
+      "loss": 3.7132,
       "step": 820
     },
     {
       "epoch": 1.6061185468451242,
+      "grad_norm": 1.8295230865478516,
       "learning_rate": 0.0009326534947949862,
+      "loss": 3.7057,
       "step": 840
     },
     {
       "epoch": 1.644359464627151,
+      "grad_norm": 1.8669532537460327,
       "learning_rate": 0.0009284045039303166,
+      "loss": 3.6656,
       "step": 860
     },
     {
       "epoch": 1.682600382409178,
+      "grad_norm": 1.8251906633377075,
       "learning_rate": 0.0009241555130656469,
+      "loss": 3.591,
       "step": 880
     },
     {
       "epoch": 1.7208413001912046,
+      "grad_norm": 1.825548529624939,
       "learning_rate": 0.0009199065222009773,
+      "loss": 3.5281,
       "step": 900
     },
     {
       "epoch": 1.7590822179732313,
+      "grad_norm": 1.7811344861984253,
       "learning_rate": 0.0009156575313363077,
+      "loss": 3.4841,
       "step": 920
     },
     {
       "epoch": 1.7973231357552581,
+      "grad_norm": 1.823803186416626,
       "learning_rate": 0.000911408540471638,
+      "loss": 3.4707,
       "step": 940
     },
     {
       "epoch": 1.835564053537285,
+      "grad_norm": 1.872054100036621,
       "learning_rate": 0.0009071595496069684,
+      "loss": 3.4532,
       "step": 960
     },
     {
       "epoch": 1.8738049713193117,
+      "grad_norm": 1.7651457786560059,
       "learning_rate": 0.0009029105587422988,
+      "loss": 3.4224,
       "step": 980
     },
     {
       "epoch": 1.9120458891013383,
+      "grad_norm": 1.7336605787277222,
       "learning_rate": 0.0008986615678776291,
+      "loss": 3.3199,
       "step": 1000
     },
     {
       "epoch": 1.9502868068833652,
+      "grad_norm": 1.8058604001998901,
       "learning_rate": 0.0008944125770129595,
+      "loss": 3.2814,
       "step": 1020
     },
     {
       "epoch": 1.988527724665392,
+      "grad_norm": 1.9400501251220703,
       "learning_rate": 0.0008901635861482899,
+      "loss": 3.2423,
       "step": 1040
     },
     {
       "epoch": 2.0,
+      "eval_accuracy": 0.5047426841574167,
+      "eval_loss": 2.282437801361084,
+      "eval_runtime": 710.9931,
+      "eval_samples_per_second": 20.907,
+      "eval_steps_per_second": 20.907,
       "step": 1046
     },
     {
       "epoch": 2.026768642447419,
+      "grad_norm": 1.7496325969696045,
       "learning_rate": 0.0008859145952836202,
+      "loss": 3.1858,
       "step": 1060
     },
     {
       "epoch": 2.0650095602294454,
+      "grad_norm": 1.7203223705291748,
       "learning_rate": 0.0008816656044189504,
+      "loss": 3.0482,
       "step": 1080
     },
     {
       "epoch": 2.1032504780114722,
+      "grad_norm": 1.6859164237976074,
       "learning_rate": 0.000877416613554281,
+      "loss": 3.0764,
       "step": 1100
     },
     {
       "epoch": 2.141491395793499,
+      "grad_norm": 1.887332558631897,
       "learning_rate": 0.0008731676226896112,
+      "loss": 2.9918,
       "step": 1120
     },
     {
       "epoch": 2.179732313575526,
+      "grad_norm": 1.7712619304656982,
       "learning_rate": 0.0008689186318249415,
+      "loss": 2.9791,
       "step": 1140
     },
     {
       "epoch": 2.2179732313575524,
+      "grad_norm": 1.8518322706222534,
       "learning_rate": 0.000864669640960272,
+      "loss": 2.9064,
       "step": 1160
     },
     {
       "epoch": 2.2562141491395793,
+      "grad_norm": 1.8636976480484009,
       "learning_rate": 0.0008604206500956023,
+      "loss": 2.9346,
       "step": 1180
     },
     {
       "epoch": 2.294455066921606,
+      "grad_norm": 1.8007034063339233,
       "learning_rate": 0.0008561716592309326,
+      "loss": 2.9154,
       "step": 1200
     },
     {
       "epoch": 2.332695984703633,
+      "grad_norm": 1.8480207920074463,
       "learning_rate": 0.000851922668366263,
+      "loss": 2.8311,
       "step": 1220
     },
     {
       "epoch": 2.3709369024856595,
+      "grad_norm": 1.8463302850723267,
       "learning_rate": 0.0008476736775015934,
+      "loss": 2.843,
       "step": 1240
     },
     {
       "epoch": 2.4091778202676863,
+      "grad_norm": 1.8563566207885742,
       "learning_rate": 0.0008434246866369237,
+      "loss": 2.8704,
       "step": 1260
     },
     {
       "epoch": 2.447418738049713,
+      "grad_norm": 1.8388174772262573,
       "learning_rate": 0.0008391756957722541,
+      "loss": 2.7794,
       "step": 1280
     },
     {
       "epoch": 2.48565965583174,
+      "grad_norm": 1.787711262702942,
       "learning_rate": 0.0008349267049075845,
+      "loss": 2.7776,
       "step": 1300
     },
     {
       "epoch": 2.5239005736137665,
+      "grad_norm": 1.6573237180709839,
       "learning_rate": 0.0008306777140429148,
+      "loss": 2.7288,
       "step": 1320
     },
     {
       "epoch": 2.5621414913957934,
+      "grad_norm": 1.8304985761642456,
       "learning_rate": 0.0008264287231782451,
+      "loss": 2.6598,
       "step": 1340
     },
     {
       "epoch": 2.6003824091778203,
+      "grad_norm": 1.769439458847046,
       "learning_rate": 0.0008221797323135756,
+      "loss": 2.6842,
       "step": 1360
     },
     {
       "epoch": 2.638623326959847,
+      "grad_norm": 1.7404167652130127,
       "learning_rate": 0.0008179307414489059,
+      "loss": 2.5974,
       "step": 1380
     },
     {
       "epoch": 2.676864244741874,
+      "grad_norm": 1.7064534425735474,
       "learning_rate": 0.0008136817505842362,
+      "loss": 2.6557,
       "step": 1400
     },
     {
       "epoch": 2.7151051625239004,
+      "grad_norm": 1.784652590751648,
       "learning_rate": 0.0008094327597195667,
+      "loss": 2.5701,
       "step": 1420
     },
     {
       "epoch": 2.7533460803059273,
+      "grad_norm": 1.730402946472168,
       "learning_rate": 0.000805183768854897,
+      "loss": 2.5879,
       "step": 1440
     },
     {
       "epoch": 2.791586998087954,
+      "grad_norm": 1.803881049156189,
       "learning_rate": 0.0008009347779902273,
+      "loss": 2.5413,
       "step": 1460
     },
     {
       "epoch": 2.8298279158699806,
+      "grad_norm": 1.7114533185958862,
       "learning_rate": 0.0007966857871255578,
+      "loss": 2.4823,
       "step": 1480
     },
     {
       "epoch": 2.8680688336520075,
+      "grad_norm": 1.7487016916275024,
       "learning_rate": 0.000792436796260888,
+      "loss": 2.4236,
       "step": 1500
     },
     {
       "epoch": 2.9063097514340344,
+      "grad_norm": 1.7806780338287354,
       "learning_rate": 0.0007881878053962183,
+      "loss": 2.4762,
       "step": 1520
     },
     {
       "epoch": 2.9445506692160612,
+      "grad_norm": 1.851486086845398,
       "learning_rate": 0.0007839388145315488,
+      "loss": 2.4221,
       "step": 1540
     },
     {
       "epoch": 2.982791586998088,
+      "grad_norm": 1.779451608657837,
       "learning_rate": 0.0007796898236668791,
+      "loss": 2.4164,
       "step": 1560
     },
     {
       "epoch": 3.0,
+      "eval_accuracy": 0.6816010763538514,
+      "eval_loss": 1.4862462282180786,
+      "eval_runtime": 715.5989,
+      "eval_samples_per_second": 20.773,
+      "eval_steps_per_second": 20.773,
       "step": 1569
     },
     {
       "epoch": 3.0210325047801145,
+      "grad_norm": 1.7617199420928955,
       "learning_rate": 0.0007754408328022094,
+      "loss": 2.3348,
       "step": 1580
     },
     {
       "epoch": 3.0592734225621414,
+      "grad_norm": 1.7850340604782104,
       "learning_rate": 0.0007711918419375399,
+      "loss": 2.3196,
       "step": 1600
     },
     {
       "epoch": 3.0975143403441683,
+      "grad_norm": 1.828715205192566,
       "learning_rate": 0.0007669428510728702,
+      "loss": 2.2991,
       "step": 1620
     },
     {
       "epoch": 3.135755258126195,
+      "grad_norm": 1.7825413942337036,
       "learning_rate": 0.0007626938602082005,
+      "loss": 2.2354,
       "step": 1640
     },
     {
       "epoch": 3.173996175908222,
+      "grad_norm": 1.8411946296691895,
       "learning_rate": 0.0007584448693435309,
+      "loss": 2.221,
       "step": 1660
     },
     {
       "epoch": 3.2122370936902485,
+      "grad_norm": 1.8236651420593262,
       "learning_rate": 0.0007541958784788613,
+      "loss": 2.1939,
       "step": 1680
     },
     {
       "epoch": 3.2504780114722753,
+      "grad_norm": 1.8275988101959229,
       "learning_rate": 0.0007499468876141916,
+      "loss": 2.211,
       "step": 1700
     },
     {
       "epoch": 3.288718929254302,
+      "grad_norm": 1.7743233442306519,
       "learning_rate": 0.000745697896749522,
+      "loss": 2.1454,
       "step": 1720
     },
     {
       "epoch": 3.3269598470363286,
+      "grad_norm": 1.7873393297195435,
       "learning_rate": 0.0007414489058848524,
+      "loss": 2.1258,
       "step": 1740
     },
     {
       "epoch": 3.3652007648183555,
+      "grad_norm": 1.8012022972106934,
       "learning_rate": 0.0007371999150201827,
+      "loss": 2.1101,
       "step": 1760
     },
     {
       "epoch": 3.4034416826003824,
+      "grad_norm": 1.8000600337982178,
       "learning_rate": 0.0007329509241555131,
+      "loss": 2.1246,
       "step": 1780
     },
     {
       "epoch": 3.4416826003824093,
+      "grad_norm": 1.7723950147628784,
       "learning_rate": 0.0007287019332908435,
+      "loss": 2.1244,
       "step": 1800
     },
     {
       "epoch": 3.479923518164436,
+      "grad_norm": 1.8095979690551758,
       "learning_rate": 0.0007244529424261738,
+      "loss": 2.1099,
       "step": 1820
     },
     {
       "epoch": 3.5181644359464626,
+      "grad_norm": 1.8022161722183228,
       "learning_rate": 0.0007202039515615042,
+      "loss": 2.0703,
       "step": 1840
     },
     {
       "epoch": 3.5564053537284894,
+      "grad_norm": 1.7775332927703857,
       "learning_rate": 0.0007159549606968346,
+      "loss": 2.0937,
       "step": 1860
     },
     {
       "epoch": 3.5946462715105163,
+      "grad_norm": 1.829291820526123,
       "learning_rate": 0.0007117059698321649,
+      "loss": 2.0305,
       "step": 1880
     },
     {
       "epoch": 3.632887189292543,
+      "grad_norm": 1.731218934059143,
       "learning_rate": 0.0007074569789674953,
+      "loss": 2.0528,
       "step": 1900
     },
     {
       "epoch": 3.67112810707457,
+      "grad_norm": 1.9170475006103516,
       "learning_rate": 0.0007032079881028257,
+      "loss": 2.0311,
       "step": 1920
     },
     {
       "epoch": 3.7093690248565965,
+      "grad_norm": 1.6934610605239868,
       "learning_rate": 0.0006989589972381559,
+      "loss": 2.006,
       "step": 1940
     },
     {
       "epoch": 3.7476099426386233,
+      "grad_norm": 1.792523741722107,
       "learning_rate": 0.0006947100063734863,
+      "loss": 1.9627,
       "step": 1960
     },
     {
       "epoch": 3.78585086042065,
+      "grad_norm": 1.7618036270141602,
       "learning_rate": 0.0006904610155088166,
+      "loss": 2.0141,
       "step": 1980
     },
     {
       "epoch": 3.8240917782026767,
+      "grad_norm": 1.7026081085205078,
       "learning_rate": 0.000686212024644147,
+      "loss": 1.9489,
       "step": 2000
     },
     {
       "epoch": 3.8623326959847035,
+      "grad_norm": 1.7117011547088623,
       "learning_rate": 0.0006819630337794774,
+      "loss": 1.965,
       "step": 2020
     },
     {
       "epoch": 3.9005736137667304,
+      "grad_norm": 1.7798806428909302,
       "learning_rate": 0.0006777140429148077,
+      "loss": 1.9188,
       "step": 2040
     },
     {
       "epoch": 3.9388145315487573,
+      "grad_norm": 1.7349345684051514,
       "learning_rate": 0.0006734650520501381,
+      "loss": 1.9044,
       "step": 2060
     },
     {
       "epoch": 3.977055449330784,
+      "grad_norm": 1.8268795013427734,
       "learning_rate": 0.0006692160611854685,
+      "loss": 1.8625,
       "step": 2080
     },
     {
       "epoch": 4.0,
+      "eval_accuracy": 0.7917255297679112,
+      "eval_loss": 0.9794349670410156,
+      "eval_runtime": 689.4289,
+      "eval_samples_per_second": 21.561,
+      "eval_steps_per_second": 21.561,
       "step": 2092
     },
     {
       "epoch": 4.015296367112811,
+      "grad_norm": 1.6992357969284058,
       "learning_rate": 0.0006649670703207988,
+      "loss": 1.8294,
       "step": 2100
     },
     {
       "epoch": 4.053537284894838,
+      "grad_norm": 1.6715435981750488,
       "learning_rate": 0.0006607180794561292,
+      "loss": 1.7572,
       "step": 2120
     },
     {
       "epoch": 4.091778202676864,
+      "grad_norm": 1.7837462425231934,
       "learning_rate": 0.0006564690885914596,
+      "loss": 1.7502,
       "step": 2140
     },
     {
       "epoch": 4.130019120458891,
+      "grad_norm": 1.7979024648666382,
       "learning_rate": 0.0006522200977267899,
+      "loss": 1.733,
       "step": 2160
     },
     {
       "epoch": 4.168260038240918,
+      "grad_norm": 1.6781351566314697,
       "learning_rate": 0.0006479711068621203,
+      "loss": 1.7437,
       "step": 2180
     },
     {
       "epoch": 4.2065009560229445,
+      "grad_norm": 1.7719937562942505,
       "learning_rate": 0.0006437221159974506,
+      "loss": 1.7702,
       "step": 2200
     },
     {
       "epoch": 4.244741873804971,
+      "grad_norm": 1.7680734395980835,
       "learning_rate": 0.000639473125132781,
+      "loss": 1.7546,
       "step": 2220
     },
     {
       "epoch": 4.282982791586998,
+      "grad_norm": 1.7028470039367676,
       "learning_rate": 0.0006352241342681113,
+      "loss": 1.6716,
       "step": 2240
     },
     {
       "epoch": 4.321223709369025,
+      "grad_norm": 1.764496922492981,
       "learning_rate": 0.0006309751434034417,
+      "loss": 1.7316,
       "step": 2260
     },
     {
       "epoch": 4.359464627151052,
+      "grad_norm": 1.7339156866073608,
       "learning_rate": 0.0006267261525387721,
+      "loss": 1.6865,
       "step": 2280
     },
     {
       "epoch": 4.397705544933078,
+      "grad_norm": 1.6657025814056396,
       "learning_rate": 0.0006224771616741024,
+      "loss": 1.6711,
       "step": 2300
     },
     {
       "epoch": 4.435946462715105,
+      "grad_norm": 1.8127187490463257,
       "learning_rate": 0.0006182281708094328,
+      "loss": 1.7128,
       "step": 2320
     },
     {
       "epoch": 4.474187380497132,
+      "grad_norm": 1.720580816268921,
       "learning_rate": 0.0006139791799447631,
+      "loss": 1.6676,
       "step": 2340
     },
     {
       "epoch": 4.512428298279159,
+      "grad_norm": 1.7044484615325928,
       "learning_rate": 0.0006097301890800934,
+      "loss": 1.6651,
       "step": 2360
     },
     {
       "epoch": 4.550669216061186,
+      "grad_norm": 1.824859857559204,
       "learning_rate": 0.0006054811982154238,
+      "loss": 1.6743,
       "step": 2380
     },
     {
       "epoch": 4.588910133843212,
+      "grad_norm": 1.847652554512024,
       "learning_rate": 0.0006012322073507542,
+      "loss": 1.6765,
       "step": 2400
     },
     {
       "epoch": 4.627151051625239,
+      "grad_norm": 1.6747491359710693,
       "learning_rate": 0.0005969832164860845,
+      "loss": 1.6257,
       "step": 2420
     },
     {
       "epoch": 4.665391969407266,
+      "grad_norm": 1.7143758535385132,
       "learning_rate": 0.0005927342256214149,
+      "loss": 1.6515,
       "step": 2440
     },
     {
       "epoch": 4.7036328871892925,
+      "grad_norm": 1.7966911792755127,
       "learning_rate": 0.0005884852347567453,
+      "loss": 1.6384,
       "step": 2460
     },
     {
       "epoch": 4.741873804971319,
+      "grad_norm": 1.6942733526229858,
       "learning_rate": 0.0005842362438920756,
+      "loss": 1.5939,
       "step": 2480
     },
     {
       "epoch": 4.780114722753346,
+      "grad_norm": 1.7530827522277832,
       "learning_rate": 0.000579987253027406,
+      "loss": 1.5908,
       "step": 2500
     },
     {
       "epoch": 4.818355640535373,
+      "grad_norm": 1.845311164855957,
       "learning_rate": 0.0005757382621627364,
+      "loss": 1.5586,
       "step": 2520
     },
     {
       "epoch": 4.8565965583174,
+      "grad_norm": 1.7648016214370728,
       "learning_rate": 0.0005714892712980667,
+      "loss": 1.5561,
       "step": 2540
     },
     {
       "epoch": 4.894837476099426,
+      "grad_norm": 1.6392900943756104,
       "learning_rate": 0.000567240280433397,
+      "loss": 1.5641,
       "step": 2560
     },
     {
       "epoch": 4.933078393881453,
+      "grad_norm": 1.6668881177902222,
       "learning_rate": 0.0005629912895687275,
+      "loss": 1.5443,
       "step": 2580
     },
     {
       "epoch": 4.97131931166348,
+      "grad_norm": 1.8424748182296753,
       "learning_rate": 0.0005587422987040578,
+      "loss": 1.5637,
       "step": 2600
     },
     {
       "epoch": 5.0,
+      "eval_accuracy": 0.8490413723511604,
+      "eval_loss": 0.7047534584999084,
+      "eval_runtime": 7835.8179,
+      "eval_samples_per_second": 1.897,
+      "eval_steps_per_second": 1.897,
       "step": 2615
     },
     {
       "epoch": 5.009560229445507,
+      "grad_norm": 1.8417068719863892,
       "learning_rate": 0.0005544933078393881,
+      "loss": 1.4799,
       "step": 2620
     },
     {
       "epoch": 5.047801147227533,
+      "grad_norm": 1.6466395854949951,
       "learning_rate": 0.0005502443169747186,
+      "loss": 1.4184,
       "step": 2640
     },
     {
       "epoch": 5.08604206500956,
+      "grad_norm": 1.7499247789382935,
       "learning_rate": 0.0005459953261100489,
+      "loss": 1.4507,
       "step": 2660
     },
     {
       "epoch": 5.124282982791587,
+      "grad_norm": 1.7968547344207764,
       "learning_rate": 0.0005417463352453792,
+      "loss": 1.405,
       "step": 2680
     },
     {
       "epoch": 5.162523900573614,
+      "grad_norm": 1.7950819730758667,
       "learning_rate": 0.0005374973443807097,
+      "loss": 1.4362,
       "step": 2700
     },
     {
       "epoch": 5.2007648183556405,
+      "grad_norm": 1.745133399963379,
       "learning_rate": 0.00053324835351604,
+      "loss": 1.4238,
       "step": 2720
     },
     {
       "epoch": 5.239005736137667,
+      "grad_norm": 1.7767413854599,
       "learning_rate": 0.0005289993626513702,
+      "loss": 1.4025,
       "step": 2740
     },
     {
       "epoch": 5.277246653919694,
+      "grad_norm": 1.7297043800354004,
       "learning_rate": 0.0005247503717867008,
+      "loss": 1.4144,
       "step": 2760
     },
     {
       "epoch": 5.315487571701721,
+      "grad_norm": 1.8174902200698853,
       "learning_rate": 0.000520501380922031,
+      "loss": 1.4071,
       "step": 2780
     },
     {
       "epoch": 5.353728489483748,
+      "grad_norm": 1.6889333724975586,
       "learning_rate": 0.0005162523900573613,
+      "loss": 1.4022,
       "step": 2800
     },
     {
       "epoch": 5.3919694072657744,
+      "grad_norm": 1.6331517696380615,
       "learning_rate": 0.0005120033991926918,
+      "loss": 1.3479,
       "step": 2820
     },
     {
       "epoch": 5.430210325047801,
+      "grad_norm": 1.8916860818862915,
       "learning_rate": 0.0005077544083280221,
+      "loss": 1.362,
       "step": 2840
     },
     {
       "epoch": 5.468451242829828,
+      "grad_norm": 1.706222653388977,
       "learning_rate": 0.0005035054174633524,
+      "loss": 1.3613,
       "step": 2860
     },
     {
       "epoch": 5.506692160611855,
+      "grad_norm": 1.6761025190353394,
       "learning_rate": 0.0004992564265986828,
+      "loss": 1.3883,
       "step": 2880
     },
     {
       "epoch": 5.544933078393882,
+      "grad_norm": 1.632095217704773,
       "learning_rate": 0.0004950074357340132,
+      "loss": 1.3432,
       "step": 2900
     },
     {
       "epoch": 5.583173996175908,
+      "grad_norm": 1.6419159173965454,
       "learning_rate": 0.0004907584448693436,
+      "loss": 1.3417,
       "step": 2920
     },
     {
       "epoch": 5.621414913957935,
+      "grad_norm": 1.8355722427368164,
       "learning_rate": 0.0004865094540046739,
+      "loss": 1.341,
       "step": 2940
     },
     {
       "epoch": 5.659655831739962,
+      "grad_norm": 1.6611793041229248,
       "learning_rate": 0.00048226046314000425,
+      "loss": 1.339,
       "step": 2960
     },
     {
       "epoch": 5.6978967495219885,
+      "grad_norm": 1.7696843147277832,
       "learning_rate": 0.0004780114722753346,
+      "loss": 1.3183,
       "step": 2980
     },
     {
       "epoch": 5.736137667304015,
+      "grad_norm": 1.6689785718917847,
       "learning_rate": 0.000473762481410665,
+      "loss": 1.3365,
       "step": 3000
     },
     {
       "epoch": 5.774378585086042,
+      "grad_norm": 1.6962292194366455,
       "learning_rate": 0.00046951349054599533,
+      "loss": 1.2966,
       "step": 3020
     },
     {
       "epoch": 5.812619502868069,
+      "grad_norm": 1.7446441650390625,
       "learning_rate": 0.0004652644996813257,
+      "loss": 1.274,
       "step": 3040
     },
     {
       "epoch": 5.850860420650095,
+      "grad_norm": 1.7083852291107178,
       "learning_rate": 0.0004610155088166561,
+      "loss": 1.3145,
       "step": 3060
     },
     {
       "epoch": 5.8891013384321225,
+      "grad_norm": 1.6674286127090454,
       "learning_rate": 0.0004567665179519864,
+      "loss": 1.2944,
       "step": 3080
     },
     {
       "epoch": 5.927342256214149,
+      "grad_norm": 1.5787798166275024,
       "learning_rate": 0.00045251752708731676,
+      "loss": 1.2772,
       "step": 3100
     },
     {
       "epoch": 5.965583173996176,
+      "grad_norm": 1.6089515686035156,
       "learning_rate": 0.0004482685362226471,
+      "loss": 1.265,
       "step": 3120
     },
     {
       "epoch": 6.0,
+      "eval_accuracy": 0.886242852337706,
+      "eval_loss": 0.5389042496681213,
+      "eval_runtime": 383.2829,
+      "eval_samples_per_second": 38.783,
+      "eval_steps_per_second": 38.783,
       "step": 3138
     },
     {
       "epoch": 6.003824091778203,
+      "grad_norm": 1.5683554410934448,
       "learning_rate": 0.0004440195453579775,
+      "loss": 1.2601,
       "step": 3140
     },
     {
       "epoch": 6.042065009560229,
+      "grad_norm": 1.6791157722473145,
       "learning_rate": 0.00043977055449330785,
+      "loss": 1.2067,
       "step": 3160
     },
     {
       "epoch": 6.080305927342256,
+      "grad_norm": 1.5930650234222412,
       "learning_rate": 0.0004355215636286382,
+      "loss": 1.1818,
       "step": 3180
     },
     {
       "epoch": 6.118546845124283,
+      "grad_norm": 1.7136551141738892,
       "learning_rate": 0.0004312725727639686,
+      "loss": 1.1871,
       "step": 3200
     },
     {
       "epoch": 6.15678776290631,
+      "grad_norm": 1.6400994062423706,
       "learning_rate": 0.0004270235818992989,
+      "loss": 1.1851,
       "step": 3220
     },
     {
       "epoch": 6.195028680688337,
+      "grad_norm": 1.722548246383667,
       "learning_rate": 0.0004227745910346293,
+      "loss": 1.1798,
       "step": 3240
     },
     {
       "epoch": 6.233269598470363,
+      "grad_norm": 1.600697636604309,
       "learning_rate": 0.0004185256001699597,
+      "loss": 1.1516,
       "step": 3260
     },
     {
       "epoch": 6.27151051625239,
+      "grad_norm": 1.6722103357315063,
       "learning_rate": 0.00041427660930528997,
+      "loss": 1.176,
       "step": 3280
     },
     {
       "epoch": 6.309751434034417,
+      "grad_norm": 1.5297291278839111,
       "learning_rate": 0.00041002761844062037,
+      "loss": 1.1625,
       "step": 3300
     },
     {
       "epoch": 6.347992351816444,
+      "grad_norm": 1.6687546968460083,
       "learning_rate": 0.00040577862757595076,
+      "loss": 1.1744,
       "step": 3320
     },
     {
       "epoch": 6.3862332695984705,
+      "grad_norm": 1.6758590936660767,
       "learning_rate": 0.00040152963671128105,
+      "loss": 1.1408,
       "step": 3340
     },
     {
       "epoch": 6.424474187380497,
+      "grad_norm": 1.7506797313690186,
       "learning_rate": 0.00039728064584661145,
+      "loss": 1.1458,
       "step": 3360
     },
     {
       "epoch": 6.462715105162524,
+      "grad_norm": 1.7690140008926392,
       "learning_rate": 0.0003930316549819418,
+      "loss": 1.1579,
       "step": 3380
     },
     {
       "epoch": 6.500956022944551,
+      "grad_norm": 1.7732901573181152,
       "learning_rate": 0.00038878266411727214,
+      "loss": 1.1444,
       "step": 3400
     },
     {
       "epoch": 6.539196940726577,
+      "grad_norm": 1.7551547288894653,
       "learning_rate": 0.00038453367325260254,
+      "loss": 1.1431,
       "step": 3420
     },
     {
       "epoch": 6.577437858508604,
+      "grad_norm": 1.6275290250778198,
       "learning_rate": 0.0003802846823879329,
+      "loss": 1.1293,
       "step": 3440
     },
     {
       "epoch": 6.615678776290631,
+      "grad_norm": 1.769103765487671,
       "learning_rate": 0.0003760356915232632,
+      "loss": 1.1418,
       "step": 3460
     },
     {
       "epoch": 6.653919694072657,
+      "grad_norm": 1.7487330436706543,
       "learning_rate": 0.0003717867006585936,
+      "loss": 1.1305,
       "step": 3480
     },
     {
       "epoch": 6.692160611854685,
+      "grad_norm": 1.698512315750122,
       "learning_rate": 0.0003675377097939239,
+      "loss": 1.0805,
       "step": 3500
     },
     {
       "epoch": 6.730401529636711,
+      "grad_norm": 1.6636496782302856,
       "learning_rate": 0.0003632887189292543,
+      "loss": 1.0907,
       "step": 3520
     },
     {
       "epoch": 6.768642447418738,
+      "grad_norm": 1.577497959136963,
       "learning_rate": 0.00035903972806458466,
+      "loss": 1.1399,
       "step": 3540
     },
     {
       "epoch": 6.806883365200765,
+      "grad_norm": 1.7101361751556396,
       "learning_rate": 0.000354790737199915,
+      "loss": 1.1206,
       "step": 3560
     },
     {
       "epoch": 6.845124282982791,
+      "grad_norm": 1.6473299264907837,
       "learning_rate": 0.0003505417463352454,
+      "loss": 1.103,
       "step": 3580
     },
     {
       "epoch": 6.8833652007648185,
+      "grad_norm": 1.6744282245635986,
       "learning_rate": 0.00034629275547057574,
+      "loss": 1.1088,
       "step": 3600
     },
     {
       "epoch": 6.921606118546845,
+      "grad_norm": 1.67130708694458,
       "learning_rate": 0.0003420437646059061,
+      "loss": 1.0857,
       "step": 3620
     },
     {
       "epoch": 6.959847036328872,
+      "grad_norm": 1.6932523250579834,
       "learning_rate": 0.0003377947737412365,
       "loss": 1.0912,
       "step": 3640
     },
     {
       "epoch": 6.998087954110899,
+      "grad_norm": 1.6580239534378052,
       "learning_rate": 0.00033354578287656683,
+      "loss": 1.0888,
       "step": 3660
     },
     {
       "epoch": 7.0,
+      "eval_accuracy": 0.9101244534140599,
+      "eval_loss": 0.4364229142665863,
+      "eval_runtime": 605.3795,
+      "eval_samples_per_second": 24.555,
+      "eval_steps_per_second": 24.555,
       "step": 3661
     },
     {
       "epoch": 7.036328871892925,
+      "grad_norm": 1.5949829816818237,
       "learning_rate": 0.0003292967920118972,
+      "loss": 1.0123,
       "step": 3680
     },
     {
       "epoch": 7.074569789674952,
+      "grad_norm": 1.8134639263153076,
       "learning_rate": 0.0003250478011472275,
+      "loss": 1.0552,
       "step": 3700
     },
     {
       "epoch": 7.112810707456979,
+      "grad_norm": 1.6394524574279785,
       "learning_rate": 0.0003207988102825579,
+      "loss": 1.0142,
       "step": 3720
     },
     {
       "epoch": 7.151051625239006,
+      "grad_norm": 1.6918762922286987,
       "learning_rate": 0.00031654981941788826,
+      "loss": 1.0096,
       "step": 3740
     },
     {
       "epoch": 7.189292543021033,
+      "grad_norm": 1.673691987991333,
       "learning_rate": 0.0003123008285532186,
+      "loss": 1.0203,
       "step": 3760
     },
     {
       "epoch": 7.227533460803059,
+      "grad_norm": 1.5526095628738403,
       "learning_rate": 0.000308051837688549,
+      "loss": 1.049,
       "step": 3780
     },
     {
       "epoch": 7.265774378585086,
+      "grad_norm": 1.638197660446167,
       "learning_rate": 0.00030380284682387935,
+      "loss": 1.0247,
       "step": 3800
     },
     {
       "epoch": 7.304015296367113,
+      "grad_norm": 1.6690630912780762,
       "learning_rate": 0.0002995538559592097,
+      "loss": 0.9841,
       "step": 3820
     },
     {
       "epoch": 7.342256214149139,
+      "grad_norm": 1.645591139793396,
       "learning_rate": 0.0002953048650945401,
+      "loss": 1.018,
       "step": 3840
     },
     {
       "epoch": 7.3804971319311665,
+      "grad_norm": 1.676079273223877,
       "learning_rate": 0.0002910558742298704,
+      "loss": 0.9818,
       "step": 3860
     },
     {
       "epoch": 7.418738049713193,
+      "grad_norm": 1.6065680980682373,
       "learning_rate": 0.0002868068833652008,
+      "loss": 0.9795,
       "step": 3880
     },
     {
       "epoch": 7.45697896749522,
+      "grad_norm": 1.683929443359375,
       "learning_rate": 0.0002825578925005312,
+      "loss": 0.9588,
       "step": 3900
     },
     {
       "epoch": 7.495219885277247,
+      "grad_norm": 1.6200690269470215,
       "learning_rate": 0.00027830890163586146,
+      "loss": 1.0081,
       "step": 3920
     },
     {
       "epoch": 7.533460803059273,
+      "grad_norm": 1.7147966623306274,
       "learning_rate": 0.00027405991077119186,
+      "loss": 0.9822,
       "step": 3940
     },
     {
       "epoch": 7.5717017208413,
+      "grad_norm": 1.7224268913269043,
       "learning_rate": 0.00026981091990652226,
+      "loss": 0.988,
       "step": 3960
     },
     {
       "epoch": 7.609942638623327,
+      "grad_norm": 1.7145981788635254,
       "learning_rate": 0.00026556192904185255,
+      "loss": 0.9562,
       "step": 3980
     },
     {
       "epoch": 7.648183556405353,
+      "grad_norm": 1.8020603656768799,
       "learning_rate": 0.00026131293817718295,
+      "loss": 0.9644,
       "step": 4000
     },
     {
       "epoch": 7.686424474187381,
+      "grad_norm": 1.7413355112075806,
       "learning_rate": 0.00025706394731251324,
+      "loss": 0.9648,
       "step": 4020
     },
     {
       "epoch": 7.724665391969407,
+      "grad_norm": 1.6813682317733765,
       "learning_rate": 0.00025281495644784364,
+      "loss": 0.9244,
       "step": 4040
     },
     {
       "epoch": 7.762906309751434,
+      "grad_norm": 1.747910737991333,
       "learning_rate": 0.00024856596558317403,
+      "loss": 0.9217,
       "step": 4060
     },
     {
       "epoch": 7.801147227533461,
+      "grad_norm": 1.6242161989212036,
       "learning_rate": 0.0002443169747185044,
+      "loss": 0.9628,
       "step": 4080
     },
     {
       "epoch": 7.839388145315487,
+      "grad_norm": 1.5416340827941895,
       "learning_rate": 0.00024006798385383472,
+      "loss": 0.979,
       "step": 4100
     },
     {
       "epoch": 7.8776290630975145,
+      "grad_norm": 1.7438323497772217,
       "learning_rate": 0.00023581899298916507,
+      "loss": 0.9528,
       "step": 4120
     },
     {
       "epoch": 7.915869980879541,
+      "grad_norm": 1.6303768157958984,
       "learning_rate": 0.00023157000212449544,
+      "loss": 0.91,
       "step": 4140
     },
     {
       "epoch": 7.954110898661568,
+      "grad_norm": 1.586729884147644,
       "learning_rate": 0.00022732101125982578,
+      "loss": 0.9346,
       "step": 4160
     },
     {
       "epoch": 7.992351816443595,
+      "grad_norm": 1.771173357963562,
       "learning_rate": 0.00022307202039515615,
+      "loss": 0.9296,
       "step": 4180
     },
     {
       "epoch": 8.0,
+      "eval_accuracy": 0.9264715775311133,
+      "eval_loss": 0.36169418692588806,
+      "eval_runtime": 57.1398,
+      "eval_samples_per_second": 260.152,
+      "eval_steps_per_second": 260.152,
       "step": 4184
     },
     {
       "epoch": 8.030592734225621,
+      "grad_norm": 1.5647226572036743,
       "learning_rate": 0.00021882302953048652,
+      "loss": 0.9005,
       "step": 4200
     },
     {
       "epoch": 8.068833652007648,
+      "grad_norm": 1.6461886167526245,
       "learning_rate": 0.00021457403866581687,
+      "loss": 0.8696,
       "step": 4220
     },
     {
       "epoch": 8.107074569789676,
+      "grad_norm": 1.5358296632766724,
       "learning_rate": 0.0002103250478011472,
+      "loss": 0.8782,
       "step": 4240
     },
     {
       "epoch": 8.145315487571702,
+      "grad_norm": 1.6051462888717651,
       "learning_rate": 0.0002060760569364776,
+      "loss": 0.8483,
       "step": 4260
     },
     {
       "epoch": 8.183556405353729,
+      "grad_norm": 1.7184685468673706,
       "learning_rate": 0.00020182706607180795,
+      "loss": 0.8817,
       "step": 4280
     },
     {
       "epoch": 8.221797323135755,
+      "grad_norm": 1.6134257316589355,
       "learning_rate": 0.0001975780752071383,
+      "loss": 0.8603,
       "step": 4300
     },
     {
       "epoch": 8.260038240917781,
+      "grad_norm": 1.5783709287643433,
       "learning_rate": 0.00019332908434246867,
+      "loss": 0.8654,
       "step": 4320
     },
     {
       "epoch": 8.29827915869981,
+      "grad_norm": 1.4778318405151367,
       "learning_rate": 0.00018908009347779904,
+      "loss": 0.8554,
       "step": 4340
     },
     {
       "epoch": 8.336520076481836,
+      "grad_norm": 1.8124628067016602,
       "learning_rate": 0.00018483110261312938,
+      "loss": 0.8974,
       "step": 4360
     },
     {
       "epoch": 8.374760994263863,
+      "grad_norm": 1.7594116926193237,
       "learning_rate": 0.00018058211174845973,
+      "loss": 0.8716,
       "step": 4380
     },
     {
       "epoch": 8.413001912045889,
+      "grad_norm": 1.7039234638214111,
       "learning_rate": 0.0001763331208837901,
+      "loss": 0.8525,
       "step": 4400
     },
     {
       "epoch": 8.451242829827915,
+      "grad_norm": 1.6325209140777588,
       "learning_rate": 0.00017208413001912047,
+      "loss": 0.8663,
       "step": 4420
     },
     {
       "epoch": 8.489483747609942,
+      "grad_norm": 1.6818372011184692,
       "learning_rate": 0.00016783513915445082,
+      "loss": 0.8629,
       "step": 4440
     },
     {
       "epoch": 8.52772466539197,
+      "grad_norm": 1.5809085369110107,
       "learning_rate": 0.00016358614828978119,
+      "loss": 0.8718,
       "step": 4460
     },
     {
       "epoch": 8.565965583173996,
+      "grad_norm": 1.5711621046066284,
       "learning_rate": 0.00015933715742511153,
+      "loss": 0.8527,
       "step": 4480
     },
     {
       "epoch": 8.604206500956023,
+      "grad_norm": 1.5462543964385986,
       "learning_rate": 0.0001550881665604419,
+      "loss": 0.8589,
       "step": 4500
     },
     {
       "epoch": 8.64244741873805,
+      "grad_norm": 1.6341811418533325,
       "learning_rate": 0.00015083917569577227,
+      "loss": 0.8649,
       "step": 4520
     },
     {
       "epoch": 8.680688336520076,
+      "grad_norm": 1.5172038078308105,
       "learning_rate": 0.00014659018483110262,
+      "loss": 0.8192,
       "step": 4540
     },
     {
       "epoch": 8.718929254302104,
+      "grad_norm": 1.4799879789352417,
       "learning_rate": 0.00014234119396643296,
+      "loss": 0.8436,
       "step": 4560
     },
     {
       "epoch": 8.75717017208413,
+      "grad_norm": 1.547850251197815,
       "learning_rate": 0.00013809220310176336,
+      "loss": 0.8292,
       "step": 4580
     },
     {
       "epoch": 8.795411089866157,
+      "grad_norm": 1.8095018863677979,
       "learning_rate": 0.0001338432122370937,
+      "loss": 0.8527,
       "step": 4600
     },
     {
       "epoch": 8.833652007648183,
+      "grad_norm": 1.5578687191009521,
       "learning_rate": 0.00012959422137242405,
+      "loss": 0.8358,
       "step": 4620
     },
     {
       "epoch": 8.87189292543021,
+      "grad_norm": 1.7008335590362549,
       "learning_rate": 0.0001253452305077544,
+      "loss": 0.822,
       "step": 4640
     },
     {
       "epoch": 8.910133843212238,
+      "grad_norm": 1.6717548370361328,
       "learning_rate": 0.00012109623964308478,
+      "loss": 0.8352,
       "step": 4660
     },
     {
       "epoch": 8.948374760994264,
+      "grad_norm": 1.729179859161377,
       "learning_rate": 0.00011684724877841513,
+      "loss": 0.826,
       "step": 4680
     },
     {
       "epoch": 8.98661567877629,
+      "grad_norm": 1.6358789205551147,
       "learning_rate": 0.00011259825791374549,
+      "loss": 0.8066,
       "step": 4700
     },
     {
       "epoch": 9.0,
       "eval_accuracy": 0.9352842246888665,
+      "eval_loss": 0.3206591010093689,
+      "eval_runtime": 381.3245,
+      "eval_samples_per_second": 38.983,
+      "eval_steps_per_second": 38.983,
       "step": 4707
     },
     {
       "epoch": 9.024856596558317,
+      "grad_norm": 1.5783698558807373,
       "learning_rate": 0.00010834926704907585,
+      "loss": 0.8019,
       "step": 4720
     },
     {
       "epoch": 9.063097514340344,
+      "grad_norm": 1.5668120384216309,
       "learning_rate": 0.0001041002761844062,
+      "loss": 0.7702,
       "step": 4740
     },
     {
       "epoch": 9.101338432122372,
+      "grad_norm": 1.624084234237671,
       "learning_rate": 9.985128531973658e-05,
+      "loss": 0.7934,
       "step": 4760
     },
     {
       "epoch": 9.139579349904398,
+      "grad_norm": 1.5410951375961304,
       "learning_rate": 9.560229445506692e-05,
+      "loss": 0.7863,
       "step": 4780
     },
     {
       "epoch": 9.177820267686425,
+      "grad_norm": 1.663845419883728,
       "learning_rate": 9.135330359039729e-05,
+      "loss": 0.7894,
       "step": 4800
     },
     {
       "epoch": 9.216061185468451,
+      "grad_norm": 1.5939579010009766,
       "learning_rate": 8.710431272572764e-05,
+      "loss": 0.739,
       "step": 4820
     },
     {
       "epoch": 9.254302103250478,
+      "grad_norm": 1.5545909404754639,
       "learning_rate": 8.2855321861058e-05,
+      "loss": 0.7808,
       "step": 4840
     },
     {
       "epoch": 9.292543021032504,
+      "grad_norm": 1.665999412536621,
       "learning_rate": 7.860633099638836e-05,
+      "loss": 0.7761,
       "step": 4860
     },
     {
       "epoch": 9.330783938814532,
+      "grad_norm": 1.6480567455291748,
       "learning_rate": 7.435734013171871e-05,
+      "loss": 0.7876,
       "step": 4880
     },
     {
       "epoch": 9.369024856596559,
+      "grad_norm": 1.5779589414596558,
       "learning_rate": 7.010834926704908e-05,
+      "loss": 0.767,
       "step": 4900
     },
     {
       "epoch": 9.407265774378585,
+      "grad_norm": 1.6985348463058472,
       "learning_rate": 6.585935840237942e-05,
+      "loss": 0.783,
       "step": 4920
     },
     {
       "epoch": 9.445506692160611,
+      "grad_norm": 1.5563093423843384,
       "learning_rate": 6.16103675377098e-05,
+      "loss": 0.756,
       "step": 4940
     },
     {
       "epoch": 9.483747609942638,
+      "grad_norm": 1.6173079013824463,
       "learning_rate": 5.736137667304015e-05,
+      "loss": 0.7682,
       "step": 4960
     },
     {
       "epoch": 9.521988527724666,
+      "grad_norm": 1.5880271196365356,
       "learning_rate": 5.311238580837052e-05,
+      "loss": 0.7632,
       "step": 4980
     },
     {
       "epoch": 9.560229445506693,
+      "grad_norm": 1.6329987049102783,
       "learning_rate": 4.8863394943700874e-05,
+      "loss": 0.7602,
       "step": 5000
     },
     {
       "epoch": 9.598470363288719,
+      "grad_norm": 1.442744255065918,
       "learning_rate": 4.461440407903123e-05,
+      "loss": 0.7595,
       "step": 5020
     },
     {
       "epoch": 9.636711281070745,
+      "grad_norm": 1.5359572172164917,
       "learning_rate": 4.036541321436159e-05,
+      "loss": 0.7621,
       "step": 5040
     },
     {
       "epoch": 9.674952198852772,
+      "grad_norm": 1.6465296745300293,
       "learning_rate": 3.6116422349691954e-05,
+      "loss": 0.7717,
       "step": 5060
     },
     {
       "epoch": 9.7131931166348,
+      "grad_norm": 1.6590745449066162,
       "learning_rate": 3.186743148502231e-05,
+      "loss": 0.73,
       "step": 5080
     },
     {
       "epoch": 9.751434034416826,
+      "grad_norm": 1.5176348686218262,
       "learning_rate": 2.7618440620352666e-05,
+      "loss": 0.7486,
       "step": 5100
     },
     {
       "epoch": 9.789674952198853,
+      "grad_norm": 1.7158029079437256,
       "learning_rate": 2.3369449755683023e-05,
+      "loss": 0.7766,
       "step": 5120
     },
     {
       "epoch": 9.82791586998088,
+      "grad_norm": 1.6565515995025635,
       "learning_rate": 1.9120458891013384e-05,
+      "loss": 0.7159,
       "step": 5140
     },
     {
       "epoch": 9.866156787762906,
+      "grad_norm": 1.4815343618392944,
       "learning_rate": 1.4871468026343743e-05,
+      "loss": 0.7304,
       "step": 5160
     },
     {
       "epoch": 9.904397705544934,
+      "grad_norm": 1.6041672229766846,
       "learning_rate": 1.0622477161674103e-05,
+      "loss": 0.744,
       "step": 5180
     },
     {
       "epoch": 9.94263862332696,
+      "grad_norm": 1.6224092245101929,
       "learning_rate": 6.373486297004461e-06,
+      "loss": 0.7562,
       "step": 5200
     },
     {
       "epoch": 9.980879541108987,
+      "grad_norm": 1.5392311811447144,
       "learning_rate": 2.1244954323348204e-06,
+      "loss": 0.7675,
       "step": 5220
     },
     {
       "epoch": 10.0,
+      "eval_accuracy": 0.9405314497140935,
+      "eval_loss": 0.29811325669288635,
+      "eval_runtime": 464.2204,
+      "eval_samples_per_second": 32.021,
+      "eval_steps_per_second": 32.021,
       "step": 5230
     },
     {
       "epoch": 10.0,
       "step": 5230,
       "total_flos": 1.96318398191328e+18,
+      "train_loss": 2.122584131525306,
+      "train_runtime": 50832.5393,
+      "train_samples_per_second": 26.317,
+      "train_steps_per_second": 0.103
     }
   ],
   "logging_steps": 20,